#install.packages("tidyverse")
#install.packages("forecast")
#install.packages("ggcorrplot")
library(tidyverse)
## ── Attaching packages ──────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.3
## ✓ tibble 3.0.1 ✓ dplyr 1.0.0
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(ggcorrplot)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(e1071)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
vehicles <- read_csv("vehicles.csv")
## Parsed with column specification:
## cols(
## MPG = col_double(),
## Cylinders = col_double(),
## Engine_Size = col_double(),
## Horse_Power = col_double(),
## Vweight = col_double(),
## Acceleration = col_double(),
## Origin = col_double()
## )
#Data from Data Mining for the Masses
View(vehicles)
#Question #1 - Partition your data into a 60% training, 40% validation split.
#That gives a training set of 237 rows and a validation set of 158 rows.
set.seed(1)
#row.names(vehicles)
#nrow(vehicles)
training_set_rows_vehicles <- sample(rownames(vehicles), nrow(vehicles)*0.6)
validation_set_rows <- sample(setdiff(rownames(vehicles), training_set_rows_vehicles), nrow(vehicles)*0.4)
#View(training_set_rows)
#View(validation_set_rows)
training_data <- vehicles[training_set_rows_vehicles, ]
validation_data <- vehicles[validation_set_rows, ]
View(training_data)
View(validation_data)
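#Note: read_csv() returns a tibble, and tibbles do not formally support row names, so the
#rownames()-based split above can be fragile. A minimal alternative sketch using integer
#indices (illustrative only; training_data/validation_data above remain the working objects):
train_idx <- sample(seq_len(nrow(vehicles)), size = round(nrow(vehicles) * 0.6))
valid_idx <- setdiff(seq_len(nrow(vehicles)), train_idx)
training_data_alt <- vehicles[train_idx, ]
validation_data_alt <- vehicles[valid_idx, ]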
#MODELING
#Question #2 - Create the model and identify where it can be improved. Write a brief summary of 2-3 sentences on what you see with the model summary output.
training_lm_model <- lm(MPG ~ ., data = vehicles, subset = training_set_rows_vehicles)
#View(training_lm_model)
training_residuals <- data.frame(training_data$MPG, training_lm_model$fitted.values, training_lm_model$residuals)
View(training_residuals)
summary(training_lm_model)
##
## Call:
## lm(formula = MPG ~ ., data = vehicles, subset = training_set_rows_vehicles)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.6515 -3.1288 -0.2349 2.3266 13.6203
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.046228 3.639154 12.103 < 2e-16 ***
## Cylinders -0.989061 0.540854 -1.829 0.06874 .
## Engine_Size 0.026204 0.013913 1.883 0.06090 .
## Horse_Power -0.072093 0.024087 -2.993 0.00306 **
## Vweight -0.005034 0.001287 -3.912 0.00012 ***
## Acceleration -0.061426 0.160098 -0.384 0.70157
## Origin 2.001437 0.465542 4.299 2.53e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.278 on 230 degrees of freedom
## Multiple R-squared: 0.7187, Adjusted R-squared: 0.7114
## F-statistic: 97.95 on 6 and 230 DF, p-value: < 2.2e-16
print(training_residuals)
## training_data.MPG training_lm_model.fitted.values
## 324 43.4 29.779739
## 167 20.0 23.330437
## 129 32.0 32.735700
## 299 34.2 27.909927
## 270 23.8 24.466147
## 187 15.5 16.650151
## 307 41.5 29.485157
## 85 13.0 15.977608
## 277 29.5 31.991941
## 362 22.4 20.073369
## 330 29.8 31.727043
## 263 17.5 14.993375
## 329 33.8 31.901490
## 79 28.0 29.440667
## 213 13.0 15.890179
## 37 14.0 12.820810
## 105 18.0 24.021158
## 217 25.5 25.836256
## 366 28.0 24.363632
## 165 29.0 31.316702
## 290 18.5 16.120834
## 387 26.0 25.642214
## 89 13.0 14.592433
## 289 19.2 17.049818
## 340 30.0 26.773823
## 326 30.0 25.387682
## 386 38.0 24.628074
## 42 13.0 9.386870
## 111 21.0 23.156082
## 20 25.0 26.937677
## 44 22.0 27.279475
## 343 35.1 34.041812
## 70 15.0 14.925750
## 121 20.0 24.447962
## 40 12.0 9.542998
## 172 18.0 21.687982
## 25 11.0 8.438742
## 248 19.4 16.760942
## 198 18.0 21.758364
## 39 14.0 14.234889
## 298 23.9 19.878745
## 280 22.3 23.803844
## 160 15.0 18.998501
## 14 18.0 23.417495
## 130 25.0 26.511443
## 45 19.0 22.010898
## 22 21.0 24.587191
## 206 13.0 15.007964
## 230 16.0 13.876377
## 193 29.0 28.961353
## 104 12.0 10.912493
## 371 27.0 24.685233
## 255 19.4 22.487489
## 345 37.0 32.501068
## 346 37.7 32.573584
## 103 13.0 11.084433
## 331 32.7 23.651937
## 13 22.0 23.238450
## 296 23.0 17.592225
## 379 36.0 31.129706
## 176 23.0 24.962319
## 279 19.8 23.055548
## 110 19.0 26.394661
## 84 14.0 16.094735
## 29 25.0 30.129997
## 141 26.0 28.541691
## 252 20.2 23.328830
## 221 15.5 14.330846
## 108 22.0 29.157315
## 304 28.8 22.597639
## 33 17.0 21.743570
## 351 33.0 29.611908
## 149 31.0 30.281208
## 287 16.9 13.267182
## 102 12.0 11.110945
## 145 24.0 29.479711
## 118 19.0 23.798478
## 327 44.6 33.487390
## 107 21.0 27.284003
## 64 17.0 16.094735
## 224 20.5 20.315924
## 341 39.1 34.109620
## 51 31.0 33.175717
## 384 38.0 32.609985
## 376 31.0 32.577753
## 138 14.0 12.903930
## 394 32.0 27.306769
## 393 44.0 30.651556
## 282 20.6 20.143891
## 143 32.0 32.620641
## 285 16.5 16.662315
## 170 25.0 29.127707
## 48 28.0 29.096248
## 204 26.5 26.820783
## 295 25.4 23.342055
## 24 10.0 8.809495
## 181 25.0 29.078620
## 214 31.5 32.328324
## 338 25.8 25.472154
## 225 19.0 19.437980
## 163 20.0 20.025531
## 43 18.0 23.202733
## 1 18.0 18.430276
## 332 23.7 28.757344
## 78 22.0 26.048213
## 284 17.6 17.172757
## 116 24.0 29.909308
## 233 26.0 30.708334
## 61 14.0 13.187811
## 86 14.0 14.932549
## 334 23.6 23.696327
## 49 30.0 29.477393
## 242 43.1 31.676903
## 246 36.1 34.084053
## 247 19.9 19.125322
## 239 22.0 25.886553
## 219 17.5 15.372834
## 135 13.0 11.945572
## 368 34.0 25.519129
## 367 27.0 24.248856
## 314 19.1 21.351044
## 53 27.0 31.908966
## 352 34.5 29.088604
## 65 11.0 10.381561
## 380 36.0 31.499717
## 124 21.0 23.059370
## 77 26.0 29.508150
## 218 33.5 32.451340
## 98 18.0 24.097848
## 194 24.5 28.081973
## 19 24.0 27.284147
## 31 19.0 24.924336
## 174 19.0 22.494740
## 237 30.0 32.805269
## 75 22.0 28.037542
## 16 27.0 30.678035
## 375 37.0 32.264008
## 92 13.0 9.651650
## 122 11.0 15.208322
## 152 15.0 22.905803
## 311 28.0 25.064331
## 207 19.0 23.085575
## 244 32.8 33.204508
## 229 15.5 12.396186
## 249 20.2 17.269004
## 354 32.4 30.654659
## 253 25.1 24.776395
## 395 28.0 25.182847
## 223 17.5 20.005790
## 377 38.0 28.700019
## 313 24.3 23.207029
## 140 26.0 30.498192
## 126 15.0 21.616190
## 353 33.7 31.480688
## 348 34.7 28.234641
## 312 26.4 23.855390
## 261 17.7 16.104622
## 15 21.0 25.219532
## 274 21.6 23.937420
## 273 17.0 20.731091
## 195 29.0 30.780908
## 201 29.5 31.578946
## 322 40.8 31.833715
## 17 26.0 32.821101
## 212 13.0 16.272400
## 127 31.0 32.350084
## 133 18.0 19.236808
## 41 13.0 11.730875
## 355 32.9 27.929319
## 328 40.9 31.333436
## 159 16.0 18.339308
## 117 20.0 26.660975
## 72 13.0 13.355471
## 36 14.0 13.484572
## 306 33.5 25.881229
## 250 19.2 19.620864
## 157 14.0 12.388833
## 382 38.0 32.834727
## 317 31.3 29.959516
## 310 37.2 32.490040
## 275 16.2 18.659929
## 232 24.5 24.927100
## 106 20.0 29.651497
## 185 17.5 14.016077
## 88 12.0 9.465821
## 297 27.2 25.086069
## 216 36.0 31.651353
## 234 25.5 24.503528
## 50 30.0 29.633113
## 80 23.0 28.738976
## 30 25.0 26.775739
## 93 12.0 8.236361
## 238 30.5 29.120041
## 316 29.8 28.517022
## 254 20.5 20.475565
## 292 34.1 32.785263
## 164 13.0 20.057858
## 168 23.0 25.951306
## 308 38.1 33.038449
## 203 28.0 31.372681
## 184 27.0 28.730268
## 241 21.5 26.726719
## 100 26.0 32.211436
## 113 15.0 15.504378
## 363 26.6 19.816825
## 73 14.0 14.269115
## 27 27.0 30.678035
## 283 17.0 16.477473
## 281 20.2 22.149173
## 228 15.5 13.382473
## 38 14.0 14.560648
## 62 15.0 14.007836
## 134 16.0 14.248580
## 132 16.0 19.553295
## 35 18.0 21.478300
## 125 19.0 23.395883
## 99 23.0 22.850297
## 392 27.0 24.555891
## 272 20.3 23.887037
## 71 13.0 15.317014
## 153 15.0 24.377357
## 265 27.5 28.996598
## 381 34.0 31.537591
## 28 28.0 26.921733
## 391 27.0 23.645844
## 148 26.0 29.230422
## 293 35.7 28.366642
## 231 29.0 30.354060
## 369 31.0 24.939793
## 60 13.0 13.157339
## 173 29.0 30.793193
## 235 30.5 28.747850
## 175 23.0 25.773531
## 12 24.0 29.343623
## 205 20.0 23.323277
## 344 32.3 32.316524
## 378 36.0 27.852225
## training_lm_model.residuals
## 324 13.62026091
## 167 -3.33043657
## 129 -0.73569991
## 299 6.29007307
## 270 -0.66614733
## 187 -1.15015112
## 307 12.01484283
## 85 -2.97760820
## 277 -2.49194124
## 362 2.32663087
## 330 -1.92704327
## 263 2.50662521
## 329 1.89851016
## 79 -1.44066656
## 213 -2.89017900
## 37 1.17919000
## 105 -6.02115775
## 217 -0.33625579
## 366 3.63636809
## 165 -2.31670154
## 290 2.37916570
## 387 0.35778629
## 89 -1.59243314
## 289 2.15018229
## 340 3.22617748
## 326 4.61231765
## 386 13.37192581
## 42 3.61312973
## 111 -2.15608181
## 20 -1.93767748
## 44 -5.27947520
## 343 1.05818845
## 70 0.07425047
## 121 -4.44796187
## 40 2.45700247
## 172 -3.68798207
## 25 2.56125794
## 248 2.63905825
## 198 -3.75836353
## 39 -0.23488896
## 298 4.02125550
## 280 -1.50384376
## 160 -3.99850104
## 14 -5.41749468
## 130 -1.51144340
## 45 -3.01089829
## 22 -3.58719121
## 206 -2.00796367
## 230 2.12362319
## 193 0.03864689
## 104 1.08750681
## 371 2.31476740
## 255 -3.08748929
## 345 4.49893206
## 346 5.12641569
## 103 1.91556694
## 331 9.04806337
## 13 -1.23845000
## 296 5.40777499
## 379 4.87029439
## 176 -1.96231858
## 279 -3.25554794
## 110 -7.39466121
## 84 -2.09473479
## 29 -5.12999731
## 141 -2.54169114
## 252 -3.12883008
## 221 1.16915400
## 108 -7.15731471
## 304 6.20236143
## 33 -4.74357026
## 351 3.38809238
## 149 0.71879217
## 287 3.63281781
## 102 0.88905531
## 145 -5.47971118
## 118 -4.79847757
## 327 11.11261024
## 107 -6.28400255
## 64 0.90526521
## 224 0.18407615
## 341 4.99037964
## 51 -2.17571716
## 384 5.39001544
## 376 -1.57775323
## 138 1.09607041
## 394 4.69323097
## 393 13.34844395
## 282 0.45610867
## 143 -0.62064140
## 285 -0.16231536
## 170 -4.12770740
## 48 -1.09624825
## 204 -0.32078299
## 295 2.05794531
## 24 1.19050529
## 181 -4.07862018
## 214 -0.82832353
## 338 0.32784622
## 225 -0.43798001
## 163 -0.02553069
## 43 -5.20273339
## 1 -0.43027619
## 332 -5.05734356
## 78 -4.04821306
## 284 0.42724257
## 116 -5.90930799
## 233 -4.70833390
## 61 0.81218881
## 86 -0.93254911
## 334 -0.09632698
## 49 0.52260720
## 242 11.42309741
## 246 2.01594657
## 247 0.77467845
## 239 -3.88655284
## 219 2.12716649
## 135 1.05442767
## 368 8.48087062
## 367 2.75114433
## 314 -2.25104356
## 53 -4.90896605
## 352 5.41139595
## 65 0.61843940
## 380 4.50028304
## 124 -2.05936973
## 77 -3.50814950
## 218 1.04866012
## 98 -6.09784788
## 194 -3.58197322
## 19 -3.28414721
## 31 -5.92433601
## 174 -3.49474020
## 237 -2.80526937
## 75 -6.03754213
## 16 -3.67803486
## 375 4.73599229
## 92 3.34834961
## 122 -4.20832229
## 152 -7.90580319
## 311 2.93566946
## 207 -4.08557530
## 244 -0.40450833
## 229 3.10381432
## 249 2.93099568
## 354 1.74534120
## 253 0.32360523
## 395 2.81715257
## 223 -2.50578952
## 377 9.29998148
## 313 1.09297084
## 140 -4.49819227
## 126 -6.61619027
## 353 2.21931221
## 348 6.46535904
## 312 2.54461012
## 261 1.59537831
## 15 -4.21953246
## 274 -2.33742014
## 273 -3.73109105
## 195 -1.78090801
## 201 -2.07894628
## 322 8.96628524
## 17 -6.82110053
## 212 -3.27239983
## 127 -1.35008405
## 133 -1.23680844
## 41 1.26912509
## 355 4.97068139
## 328 9.56656379
## 159 -2.33930823
## 117 -6.66097527
## 72 -0.35547085
## 36 0.51542781
## 306 7.61877083
## 250 -0.42086384
## 157 1.61116696
## 382 5.16527309
## 317 1.34048356
## 310 4.70996048
## 275 -2.45992865
## 232 -0.42710035
## 106 -9.65149695
## 185 3.48392260
## 88 2.53417850
## 297 2.11393096
## 216 4.34864674
## 234 0.99647177
## 50 0.36688735
## 80 -5.73897638
## 30 -1.77573905
## 93 3.76363943
## 238 1.37995886
## 316 1.28297754
## 254 0.02443506
## 292 1.31473710
## 164 -7.05785778
## 168 -2.95130563
## 308 5.06155085
## 203 -3.37268075
## 184 -1.73026764
## 241 -5.22671921
## 100 -6.21143611
## 113 -0.50437847
## 363 6.78317528
## 73 -0.26911545
## 27 -3.67803486
## 283 0.52252669
## 281 -1.94917324
## 228 2.11752749
## 38 -0.56064828
## 62 0.99216420
## 134 1.75141961
## 132 -3.55329453
## 35 -3.47829953
## 125 -4.39588324
## 99 0.14970264
## 392 2.44410871
## 272 -3.58703717
## 71 -2.31701441
## 153 -9.37735699
## 265 -1.49659760
## 381 2.46240889
## 28 1.07826714
## 391 3.35415612
## 148 -3.23042217
## 293 7.33335780
## 231 -1.35406032
## 369 6.06020748
## 60 -0.15733886
## 173 -1.79319327
## 235 1.75215036
## 175 -2.77353136
## 12 -5.34362332
## 205 -3.32327735
## 344 -0.01652381
## 378 8.14777497
# The model above is a multiple linear regression of MPG on all the other variables, fit on the training rows. The training_residuals data frame pairs each actual MPG with its fitted value and its residual (actual minus fitted), so we can see how far off individual predictions are. The summary also reports a standard error and p-value for each coefficient; Acceleration contributes little (p = 0.70), which is one place the model could be improved.
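#Another place to look for improvement is correlation among the predictors. A hedged sketch
#(not part of the original output) using the ggcorrplot package loaded above; predictors such as
#Cylinders, Engine_Size, Horse_Power and Vweight tend to be strongly correlated here, which
#inflates the coefficient standard errors:
ggcorrplot(cor(training_data), lab = TRUE)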
#Question 3 - Predict on the validation data with your model, then compare those predictions to the actual values to see what the validation residuals are.
prediction_lm_model_vehicles <- predict(training_lm_model, newdata = validation_data)
validation_residuals_vehicles <- data.frame(actual = validation_data$MPG, predicted = prediction_lm_model_vehicles, residuals = validation_data$MPG - prediction_lm_model_vehicles)
ggplot(data = training_residuals) +
aes(x = training_lm_model.residuals) +
geom_histogram(bins = 20)
ggplot(data = validation_residuals_vehicles) +
aes(x = residuals) +
geom_histogram(bins = 20)
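#A hedged sketch (not part of the original output) comparing training and validation error,
#to check how much the model degrades on data it has not seen:
data.frame(set = c("training", "validation"),
           RMSE = c(sqrt(mean(training_lm_model$residuals^2)),
                    sqrt(mean(validation_residuals_vehicles$residuals^2))),
           MAE = c(mean(abs(training_lm_model$residuals)),
                   mean(abs(validation_residuals_vehicles$residuals))))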
#Question #4 - Change the model formula to not include Acceleration to see what the impact is, if any. Describe what has changed in the summary results between your previous model and this new one (1-2 sentences).
training_lm_model2 <- lm(MPG ~ Cylinders + Engine_Size + Horse_Power + Vweight + Origin, data = vehicles, subset = training_set_rows_vehicles)
summary(training_lm_model2)
##
## Call:
## lm(formula = MPG ~ Cylinders + Engine_Size + Horse_Power + Vweight +
## Origin, data = vehicles, subset = training_set_rows_vehicles)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.8514 -3.0766 -0.1866 2.3508 13.3867
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.999302 2.403441 17.891 < 2e-16 ***
## Cylinders -0.987226 0.539833 -1.829 0.068725 .
## Engine_Size 0.026878 0.013776 1.951 0.052256 .
## Horse_Power -0.066110 0.018323 -3.608 0.000378 ***
## Vweight -0.005260 0.001142 -4.605 6.81e-06 ***
## Origin 1.999769 0.464661 4.304 2.48e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.27 on 231 degrees of freedom
## Multiple R-squared: 0.7185, Adjusted R-squared: 0.7124
## F-statistic: 117.9 on 5 and 231 DF, p-value: < 2.2e-16
#Dropping Acceleration barely changes the fit: the residual standard error edges down from 4.278 to 4.27, multiple R-squared dips from 0.7187 to 0.7185 while adjusted R-squared rises from 0.7114 to 0.7124, the residual degrees of freedom go from 230 to 231, and the F-statistic increases from 97.95 to 117.9 with the p-value still < 2.2e-16.
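#A nested-model F test (not part of the original output) makes the comparison between the two
#models explicit; a large p-value here would support dropping Acceleration:
anova(training_lm_model2, training_lm_model)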
#Question #5 - Using one of the models that you created, and the corresponding multiple linear regression formula, predict the MPG of a vehicle with the following characteristics:
#4 cylinders, 120 engine size, 130 horse power, 2200 vweight, 3 origin
MPG1 <- (-0.989061*4) + (0.026204*120) + (-0.072093*130) + (-0.005034*2200) + (2.001437*3) + 44.045228
print(MPG1)
## [1] 28.79089
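#Note that the hand calculation above copies the coefficients of the first model,
#training_lm_model (the intercept is typed as 44.045228, while the summary reports 44.046228),
#and leaves its Acceleration term out. A hedged sketch (not part of the original run) that gets
#the prediction from the Acceleration-free model directly, avoiding hand-copied coefficients
#(new_vehicle is an illustrative name):
new_vehicle <- data.frame(Cylinders = 4, Engine_Size = 120, Horse_Power = 130, Vweight = 2200, Origin = 3)
predict(training_lm_model2, newdata = new_vehicle)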
Part 2 - Classification Model
Question #1 - Using the Pima Native Americans dataset, make a naive Bayes model but with the NA values present instead of median imputation. Make a confusion matrix for the validation set. Answer the following - which of your two models (the median or the NA) gives the better positive predictive value? No explanation needed - just answer that question. You should include the code that led to that answer.
pima <- read_csv("pima_diabetes.csv")
## Parsed with column specification:
## cols(
## Pregnancies = col_double(),
## Glucose = col_double(),
## BloodPressure = col_double(),
## SkinThickness = col_double(),
## Insulin = col_double(),
## BMI = col_double(),
## DiabetesPedigreeFunction = col_double(),
## Age = col_double(),
## Outcome = col_double()
## )
view(pima)
pima_cleaned_na <- pima %>%
mutate(Glucose = ifelse(Glucose == 0, NA, Glucose),
BloodPressure = ifelse(BloodPressure == 0, NA, BloodPressure),
SkinThickness = ifelse(SkinThickness == 0, NA, SkinThickness),
Insulin = ifelse(Insulin == 0, NA, Insulin),
BMI = ifelse(BMI == 0, NA, BMI))
pima_cleaned_na$Outcome <- factor(pima_cleaned_na$Outcome, levels = c(1, 0), labels = c("Positive", "Negative"))
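#The median-imputed model referred to in the question comes from an earlier part of the
#assignment and is not shown here. A minimal sketch of what that variant presumably looks like
#(assumption: each zero is replaced by the median of the non-zero values in its column):
pima_cleaned_median <- pima %>%
  mutate(across(c(Glucose, BloodPressure, SkinThickness, Insulin, BMI),
                ~ ifelse(.x == 0, median(.x[.x != 0]), .x)))
pima_cleaned_median$Outcome <- factor(pima_cleaned_median$Outcome, levels = c(1, 0), labels = c("Positive", "Negative"))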
set.seed(1)
training_partition_na <- createDataPartition(y = pima_cleaned_na$Outcome, p = 0.6, list = FALSE)
training_pima_new_na <- pima_cleaned_na[training_partition_na, ]
## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
validation_pima_new_na <- pima_cleaned_na[-training_partition_na, ]
prop.table(table(pima_cleaned_na$Outcome)) * 100
##
## Positive Negative
## 34.89583 65.10417
prop.table(table(training_pima_new_na$Outcome)) *100
##
## Positive Negative
## 34.92408 65.07592
prop.table(table(validation_pima_new_na$Outcome)) *100
##
## Positive Negative
## 34.85342 65.14658
predictors_na <- training_pima_new_na[, 1:8]
response_na <- training_pima_new_na$Outcome
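#The fitted naive Bayes model and the confusion matrix that the question asks for are not shown
#above. A minimal sketch of the remaining steps, assuming e1071::naiveBayes() and
#caret::confusionMatrix() as loaded above (the names nb_model_na / nb_predictions_na are
#illustrative):
nb_model_na <- naiveBayes(x = predictors_na, y = response_na)
nb_predictions_na <- predict(nb_model_na, newdata = validation_pima_new_na[, 1:8])
confusionMatrix(nb_predictions_na, validation_pima_new_na$Outcome, positive = "Positive")
#The "Pos Pred Value" entry of this confusion matrix is compared against the same entry from the
#median-imputed model's confusion matrix to answer the question below.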
The median-imputed model provides the better positive predictive value.