#install.packages("tidyverse")
#install.packages("forecast")
#install.packages("ggcorrplot")
library(tidyverse)
## ── Attaching packages ──────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.3
## ✓ tibble 3.0.1 ✓ dplyr 1.0.0
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(ggcorrplot)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(e1071)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
vehicles <- read_csv("vehicles.csv")
## Parsed with column specification:
## cols(
## MPG = col_double(),
## Cylinders = col_double(),
## Engine_Size = col_double(),
## Horse_Power = col_double(),
## Vweight = col_double(),
## Acceleration = col_double(),
## Origin = col_double()
## )
#Data from Data Mining for the Masses
View(vehicles)
#Question #1 - Partition your data into a 60% training, 40% validation split.
#That gives a training set of 237 rows and a validation set of 158 rows.
set.seed(1)
#row.names(vehicles)
#nrow(vehicles)
training_set_rows_vehicles <- sample(rownames(vehicles), nrow(vehicles)*0.6)
validation_set_rows <- sample(setdiff(rownames(vehicles), training_set_rows_vehicles), nrow(vehicles)*0.4)
#View(training_set_rows)
#View(validation_set_rows)
training_data <- vehicles[training_set_rows_vehicles, ]
validation_data <- vehicles[validation_set_rows, ]
View(training_data)
View(validation_data)
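#Note: read_csv() returns a tibble, and tibbles do not formally support row names, so the
#rownames()-based split above can be fragile. A minimal alternative sketch using integer
#indices (illustrative only; training_data/validation_data above remain the working objects):
train_idx <- sample(seq_len(nrow(vehicles)), size = round(nrow(vehicles) * 0.6))
valid_idx <- setdiff(seq_len(nrow(vehicles)), train_idx)
training_data_alt <- vehicles[train_idx, ]
validation_data_alt <- vehicles[valid_idx, ]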
#MODELING
#Question #2 - Create the model and identify where it can be improved. Write a brief summary of 2-3 sentences on what you see with the model summary output.
training_lm_model <- lm(MPG ~ ., data = vehicles, subset = training_set_rows_vehicles)
#View(training_lm_model)
training_residuals <- data.frame(training_data$MPG, training_lm_model$fitted.values, training_lm_model$residuals)
View(training_residuals)
summary(training_lm_model)
##
## Call:
## lm(formula = MPG ~ ., data = vehicles, subset = training_set_rows_vehicles)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.6515 -3.1288 -0.2349 2.3266 13.6203
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.046228 3.639154 12.103 < 2e-16 ***
## Cylinders -0.989061 0.540854 -1.829 0.06874 .
## Engine_Size 0.026204 0.013913 1.883 0.06090 .
## Horse_Power -0.072093 0.024087 -2.993 0.00306 **
## Vweight -0.005034 0.001287 -3.912 0.00012 ***
## Acceleration -0.061426 0.160098 -0.384 0.70157
## Origin 2.001437 0.465542 4.299 2.53e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.278 on 230 degrees of freedom
## Multiple R-squared: 0.7187, Adjusted R-squared: 0.7114
## F-statistic: 97.95 on 6 and 230 DF, p-value: < 2.2e-16
print(training_residuals)
## training_data.MPG training_lm_model.fitted.values
## 324 43.4 29.779739
## 167 20.0 23.330437
## 129 32.0 32.735700
## 299 34.2 27.909927
## 270 23.8 24.466147
## 187 15.5 16.650151
## 307 41.5 29.485157
## 85 13.0 15.977608
## 277 29.5 31.991941
## 362 22.4 20.073369
## 330 29.8 31.727043
## 263 17.5 14.993375
## 329 33.8 31.901490
## 79 28.0 29.440667
## 213 13.0 15.890179
## 37 14.0 12.820810
## 105 18.0 24.021158
## 217 25.5 25.836256
## 366 28.0 24.363632
## 165 29.0 31.316702
## 290 18.5 16.120834
## 387 26.0 25.642214
## 89 13.0 14.592433
## 289 19.2 17.049818
## 340 30.0 26.773823
## 326 30.0 25.387682
## 386 38.0 24.628074
## 42 13.0 9.386870
## 111 21.0 23.156082
## 20 25.0 26.937677
## 44 22.0 27.279475
## 343 35.1 34.041812
## 70 15.0 14.925750
## 121 20.0 24.447962
## 40 12.0 9.542998
## 172 18.0 21.687982
## 25 11.0 8.438742
## 248 19.4 16.760942
## 198 18.0 21.758364
## 39 14.0 14.234889
## 298 23.9 19.878745
## 280 22.3 23.803844
## 160 15.0 18.998501
## 14 18.0 23.417495
## 130 25.0 26.511443
## 45 19.0 22.010898
## 22 21.0 24.587191
## 206 13.0 15.007964
## 230 16.0 13.876377
## 193 29.0 28.961353
## 104 12.0 10.912493
## 371 27.0 24.685233
## 255 19.4 22.487489
## 345 37.0 32.501068
## 346 37.7 32.573584
## 103 13.0 11.084433
## 331 32.7 23.651937
## 13 22.0 23.238450
## 296 23.0 17.592225
## 379 36.0 31.129706
## 176 23.0 24.962319
## 279 19.8 23.055548
## 110 19.0 26.394661
## 84 14.0 16.094735
## 29 25.0 30.129997
## 141 26.0 28.541691
## 252 20.2 23.328830
## 221 15.5 14.330846
## 108 22.0 29.157315
## 304 28.8 22.597639
## 33 17.0 21.743570
## 351 33.0 29.611908
## 149 31.0 30.281208
## 287 16.9 13.267182
## 102 12.0 11.110945
## 145 24.0 29.479711
## 118 19.0 23.798478
## 327 44.6 33.487390
## 107 21.0 27.284003
## 64 17.0 16.094735
## 224 20.5 20.315924
## 341 39.1 34.109620
## 51 31.0 33.175717
## 384 38.0 32.609985
## 376 31.0 32.577753
## 138 14.0 12.903930
## 394 32.0 27.306769
## 393 44.0 30.651556
## 282 20.6 20.143891
## 143 32.0 32.620641
## 285 16.5 16.662315
## 170 25.0 29.127707
## 48 28.0 29.096248
## 204 26.5 26.820783
## 295 25.4 23.342055
## 24 10.0 8.809495
## 181 25.0 29.078620
## 214 31.5 32.328324
## 338 25.8 25.472154
## 225 19.0 19.437980
## 163 20.0 20.025531
## 43 18.0 23.202733
## 1 18.0 18.430276
## 332 23.7 28.757344
## 78 22.0 26.048213
## 284 17.6 17.172757
## 116 24.0 29.909308
## 233 26.0 30.708334
## 61 14.0 13.187811
## 86 14.0 14.932549
## 334 23.6 23.696327
## 49 30.0 29.477393
## 242 43.1 31.676903
## 246 36.1 34.084053
## 247 19.9 19.125322
## 239 22.0 25.886553
## 219 17.5 15.372834
## 135 13.0 11.945572
## 368 34.0 25.519129
## 367 27.0 24.248856
## 314 19.1 21.351044
## 53 27.0 31.908966
## 352 34.5 29.088604
## 65 11.0 10.381561
## 380 36.0 31.499717
## 124 21.0 23.059370
## 77 26.0 29.508150
## 218 33.5 32.451340
## 98 18.0 24.097848
## 194 24.5 28.081973
## 19 24.0 27.284147
## 31 19.0 24.924336
## 174 19.0 22.494740
## 237 30.0 32.805269
## 75 22.0 28.037542
## 16 27.0 30.678035
## 375 37.0 32.264008
## 92 13.0 9.651650
## 122 11.0 15.208322
## 152 15.0 22.905803
## 311 28.0 25.064331
## 207 19.0 23.085575
## 244 32.8 33.204508
## 229 15.5 12.396186
## 249 20.2 17.269004
## 354 32.4 30.654659
## 253 25.1 24.776395
## 395 28.0 25.182847
## 223 17.5 20.005790
## 377 38.0 28.700019
## 313 24.3 23.207029
## 140 26.0 30.498192
## 126 15.0 21.616190
## 353 33.7 31.480688
## 348 34.7 28.234641
## 312 26.4 23.855390
## 261 17.7 16.104622
## 15 21.0 25.219532
## 274 21.6 23.937420
## 273 17.0 20.731091
## 195 29.0 30.780908
## 201 29.5 31.578946
## 322 40.8 31.833715
## 17 26.0 32.821101
## 212 13.0 16.272400
## 127 31.0 32.350084
## 133 18.0 19.236808
## 41 13.0 11.730875
## 355 32.9 27.929319
## 328 40.9 31.333436
## 159 16.0 18.339308
## 117 20.0 26.660975
## 72 13.0 13.355471
## 36 14.0 13.484572
## 306 33.5 25.881229
## 250 19.2 19.620864
## 157 14.0 12.388833
## 382 38.0 32.834727
## 317 31.3 29.959516
## 310 37.2 32.490040
## 275 16.2 18.659929
## 232 24.5 24.927100
## 106 20.0 29.651497
## 185 17.5 14.016077
## 88 12.0 9.465821
## 297 27.2 25.086069
## 216 36.0 31.651353
## 234 25.5 24.503528
## 50 30.0 29.633113
## 80 23.0 28.738976
## 30 25.0 26.775739
## 93 12.0 8.236361
## 238 30.5 29.120041
## 316 29.8 28.517022
## 254 20.5 20.475565
## 292 34.1 32.785263
## 164 13.0 20.057858
## 168 23.0 25.951306
## 308 38.1 33.038449
## 203 28.0 31.372681
## 184 27.0 28.730268
## 241 21.5 26.726719
## 100 26.0 32.211436
## 113 15.0 15.504378
## 363 26.6 19.816825
## 73 14.0 14.269115
## 27 27.0 30.678035
## 283 17.0 16.477473
## 281 20.2 22.149173
## 228 15.5 13.382473
## 38 14.0 14.560648
## 62 15.0 14.007836
## 134 16.0 14.248580
## 132 16.0 19.553295
## 35 18.0 21.478300
## 125 19.0 23.395883
## 99 23.0 22.850297
## 392 27.0 24.555891
## 272 20.3 23.887037
## 71 13.0 15.317014
## 153 15.0 24.377357
## 265 27.5 28.996598
## 381 34.0 31.537591
## 28 28.0 26.921733
## 391 27.0 23.645844
## 148 26.0 29.230422
## 293 35.7 28.366642
## 231 29.0 30.354060
## 369 31.0 24.939793
## 60 13.0 13.157339
## 173 29.0 30.793193
## 235 30.5 28.747850
## 175 23.0 25.773531
## 12 24.0 29.343623
## 205 20.0 23.323277
## 344 32.3 32.316524
## 378 36.0 27.852225
## training_lm_model.residuals
## 324 13.62026091
## 167 -3.33043657
## 129 -0.73569991
## 299 6.29007307
## 270 -0.66614733
## 187 -1.15015112
## 307 12.01484283
## 85 -2.97760820
## 277 -2.49194124
## 362 2.32663087
## 330 -1.92704327
## 263 2.50662521
## 329 1.89851016
## 79 -1.44066656
## 213 -2.89017900
## 37 1.17919000
## 105 -6.02115775
## 217 -0.33625579
## 366 3.63636809
## 165 -2.31670154
## 290 2.37916570
## 387 0.35778629
## 89 -1.59243314
## 289 2.15018229
## 340 3.22617748
## 326 4.61231765
## 386 13.37192581
## 42 3.61312973
## 111 -2.15608181
## 20 -1.93767748
## 44 -5.27947520
## 343 1.05818845
## 70 0.07425047
## 121 -4.44796187
## 40 2.45700247
## 172 -3.68798207
## 25 2.56125794
## 248 2.63905825
## 198 -3.75836353
## 39 -0.23488896
## 298 4.02125550
## 280 -1.50384376
## 160 -3.99850104
## 14 -5.41749468
## 130 -1.51144340
## 45 -3.01089829
## 22 -3.58719121
## 206 -2.00796367
## 230 2.12362319
## 193 0.03864689
## 104 1.08750681
## 371 2.31476740
## 255 -3.08748929
## 345 4.49893206
## 346 5.12641569
## 103 1.91556694
## 331 9.04806337
## 13 -1.23845000
## 296 5.40777499
## 379 4.87029439
## 176 -1.96231858
## 279 -3.25554794
## 110 -7.39466121
## 84 -2.09473479
## 29 -5.12999731
## 141 -2.54169114
## 252 -3.12883008
## 221 1.16915400
## 108 -7.15731471
## 304 6.20236143
## 33 -4.74357026
## 351 3.38809238
## 149 0.71879217
## 287 3.63281781
## 102 0.88905531
## 145 -5.47971118
## 118 -4.79847757
## 327 11.11261024
## 107 -6.28400255
## 64 0.90526521
## 224 0.18407615
## 341 4.99037964
## 51 -2.17571716
## 384 5.39001544
## 376 -1.57775323
## 138 1.09607041
## 394 4.69323097
## 393 13.34844395
## 282 0.45610867
## 143 -0.62064140
## 285 -0.16231536
## 170 -4.12770740
## 48 -1.09624825
## 204 -0.32078299
## 295 2.05794531
## 24 1.19050529
## 181 -4.07862018
## 214 -0.82832353
## 338 0.32784622
## 225 -0.43798001
## 163 -0.02553069
## 43 -5.20273339
## 1 -0.43027619
## 332 -5.05734356
## 78 -4.04821306
## 284 0.42724257
## 116 -5.90930799
## 233 -4.70833390
## 61 0.81218881
## 86 -0.93254911
## 334 -0.09632698
## 49 0.52260720
## 242 11.42309741
## 246 2.01594657
## 247 0.77467845
## 239 -3.88655284
## 219 2.12716649
## 135 1.05442767
## 368 8.48087062
## 367 2.75114433
## 314 -2.25104356
## 53 -4.90896605
## 352 5.41139595
## 65 0.61843940
## 380 4.50028304
## 124 -2.05936973
## 77 -3.50814950
## 218 1.04866012
## 98 -6.09784788
## 194 -3.58197322
## 19 -3.28414721
## 31 -5.92433601
## 174 -3.49474020
## 237 -2.80526937
## 75 -6.03754213
## 16 -3.67803486
## 375 4.73599229
## 92 3.34834961
## 122 -4.20832229
## 152 -7.90580319
## 311 2.93566946
## 207 -4.08557530
## 244 -0.40450833
## 229 3.10381432
## 249 2.93099568
## 354 1.74534120
## 253 0.32360523
## 395 2.81715257
## 223 -2.50578952
## 377 9.29998148
## 313 1.09297084
## 140 -4.49819227
## 126 -6.61619027
## 353 2.21931221
## 348 6.46535904
## 312 2.54461012
## 261 1.59537831
## 15 -4.21953246
## 274 -2.33742014
## 273 -3.73109105
## 195 -1.78090801
## 201 -2.07894628
## 322 8.96628524
## 17 -6.82110053
## 212 -3.27239983
## 127 -1.35008405
## 133 -1.23680844
## 41 1.26912509
## 355 4.97068139
## 328 9.56656379
## 159 -2.33930823
## 117 -6.66097527
## 72 -0.35547085
## 36 0.51542781
## 306 7.61877083
## 250 -0.42086384
## 157 1.61116696
## 382 5.16527309
## 317 1.34048356
## 310 4.70996048
## 275 -2.45992865
## 232 -0.42710035
## 106 -9.65149695
## 185 3.48392260
## 88 2.53417850
## 297 2.11393096
## 216 4.34864674
## 234 0.99647177
## 50 0.36688735
## 80 -5.73897638
## 30 -1.77573905
## 93 3.76363943
## 238 1.37995886
## 316 1.28297754
## 254 0.02443506
## 292 1.31473710
## 164 -7.05785778
## 168 -2.95130563
## 308 5.06155085
## 203 -3.37268075
## 184 -1.73026764
## 241 -5.22671921
## 100 -6.21143611
## 113 -0.50437847
## 363 6.78317528
## 73 -0.26911545
## 27 -3.67803486
## 283 0.52252669
## 281 -1.94917324
## 228 2.11752749
## 38 -0.56064828
## 62 0.99216420
## 134 1.75141961
## 132 -3.55329453
## 35 -3.47829953
## 125 -4.39588324
## 99 0.14970264
## 392 2.44410871
## 272 -3.58703717
## 71 -2.31701441
## 153 -9.37735699
## 265 -1.49659760
## 381 2.46240889
## 28 1.07826714
## 391 3.35415612
## 148 -3.23042217
## 293 7.33335780
## 231 -1.35406032
## 369 6.06020748
## 60 -0.15733886
## 173 -1.79319327
## 235 1.75215036
## 175 -2.77353136
## 12 -5.34362332
## 205 -3.32327735
## 344 -0.01652381
## 378 8.14777497
# The model above is a multiple linear regression of MPG on all the other variables, fit on the training rows. The training_residuals data frame pairs each actual MPG with its fitted value and its residual (actual minus fitted), so we can see how far off individual predictions are. The summary also reports a standard error and p-value for each coefficient; Acceleration contributes little (p = 0.70), which is one place the model could be improved.
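#Another place to look for improvement is correlation among the predictors. A hedged sketch
#(not part of the original output) using the ggcorrplot package loaded above; predictors such as
#Cylinders, Engine_Size, Horse_Power and Vweight tend to be strongly correlated here, which
#inflates the coefficient standard errors:
ggcorrplot(cor(training_data), lab = TRUE)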
#Question 3 - Predict on the validation data with your model, then compare those predictions to the actual values to see what the validation residuals are.
prediction_lm_model_vehicles <- predict(training_lm_model, newdata = validation_data)
validation_residuals_vehicles <- data.frame(actual = validation_data$MPG, predicted = prediction_lm_model_vehicles, residuals = validation_data$MPG - prediction_lm_model_vehicles)
ggplot(data = training_residuals) +
aes(x = training_lm_model.residuals) +
geom_histogram(bins = 20)
ggplot(data = validation_residuals_vehicles) +
aes(x = residuals) +
geom_histogram(bins = 20)
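#A hedged sketch (not part of the original output) comparing training and validation error,
#to check how much the model degrades on data it has not seen:
data.frame(set = c("training", "validation"),
           RMSE = c(sqrt(mean(training_lm_model$residuals^2)),
                    sqrt(mean(validation_residuals_vehicles$residuals^2))),
           MAE = c(mean(abs(training_lm_model$residuals)),
                   mean(abs(validation_residuals_vehicles$residuals))))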
#Question #4 - Change the model formula to not include Acceleration to see what the impact is, if any. Describe what has changed in the summary results between your previous model and this new one (1-2 sentences).
training_lm_model2 <- lm(MPG ~ Cylinders + Engine_Size + Horse_Power + Vweight + Origin, data = vehicles, subset = training_set_rows_vehicles)
summary(training_lm_model2)
##
## Call:
## lm(formula = MPG ~ Cylinders + Engine_Size + Horse_Power + Vweight +
## Origin, data = vehicles, subset = training_set_rows_vehicles)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.8514 -3.0766 -0.1866 2.3508 13.3867
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.999302 2.403441 17.891 < 2e-16 ***
## Cylinders -0.987226 0.539833 -1.829 0.068725 .
## Engine_Size 0.026878 0.013776 1.951 0.052256 .
## Horse_Power -0.066110 0.018323 -3.608 0.000378 ***
## Vweight -0.005260 0.001142 -4.605 6.81e-06 ***
## Origin 1.999769 0.464661 4.304 2.48e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.27 on 231 degrees of freedom
## Multiple R-squared: 0.7185, Adjusted R-squared: 0.7124
## F-statistic: 117.9 on 5 and 231 DF, p-value: < 2.2e-16
#Dropping Acceleration barely changes the fit: the residual standard error edges down from 4.278 to 4.27, multiple R-squared dips from 0.7187 to 0.7185 while adjusted R-squared rises from 0.7114 to 0.7124, the residual degrees of freedom go from 230 to 231, and the F-statistic increases from 97.95 to 117.9 with the p-value still < 2.2e-16.
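#A nested-model F test (not part of the original output) makes the comparison between the two
#models explicit; a large p-value here would support dropping Acceleration:
anova(training_lm_model2, training_lm_model)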
#Question #5 - Using one of the models that you created, and the corresponding multiple linear regression formula, predict the MPG of a vehicle with the following characteristics:
#4 cylinders, 120 engine size, 130 horse power, 2200 vweight, 3 origin
MPG1 <- (-0.989061*4) + (0.026204*120) + (-0.072093*130) + (-0.005034*2200) + (2.001437*3) + 44.045228
print(MPG1)
## [1] 28.79089
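#Note that the hand calculation above copies the coefficients of the first model,
#training_lm_model (the intercept is typed as 44.045228, while the summary reports 44.046228),
#and leaves its Acceleration term out. A hedged sketch (not part of the original run) that gets
#the prediction from the Acceleration-free model directly, avoiding hand-copied coefficients
#(new_vehicle is an illustrative name):
new_vehicle <- data.frame(Cylinders = 4, Engine_Size = 120, Horse_Power = 130, Vweight = 2200, Origin = 3)
predict(training_lm_model2, newdata = new_vehicle)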
Part 2 - Classification Model
Question #1 - Using the Pima Native Americans dataset, make a naive Bayes model but with the NA values present instead of median imputation. Make a confusion matrix for the validation set. Answer the following - which of your two models (the median or the NA) gives the better positive predictive value? No explanation needed - just answer that question. You should include the code that led to that answer.
pima <- read_csv("pima_diabetes.csv")
## Parsed with column specification:
## cols(
## Pregnancies = col_double(),
## Glucose = col_double(),
## BloodPressure = col_double(),
## SkinThickness = col_double(),
## Insulin = col_double(),
## BMI = col_double(),
## DiabetesPedigreeFunction = col_double(),
## Age = col_double(),
## Outcome = col_double()
## )
view(pima)
pima_cleaned_na <- pima %>%
mutate(Glucose = ifelse(Glucose == 0, NA, Glucose),
BloodPressure = ifelse(BloodPressure == 0, NA, BloodPressure),
SkinThickness = ifelse(SkinThickness == 0, NA, SkinThickness),
Insulin = ifelse(Insulin == 0, NA, Insulin),
BMI = ifelse(BMI == 0, NA, BMI))
pima_cleaned_na$Outcome <- factor(pima_cleaned_na$Outcome, levels = c(1, 0), labels = c("Positive", "Negative"))
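#The median-imputed model referred to in the question comes from an earlier part of the
#assignment and is not shown here. A minimal sketch of what that variant presumably looks like
#(assumption: each zero is replaced by the median of the non-zero values in its column):
pima_cleaned_median <- pima %>%
  mutate(across(c(Glucose, BloodPressure, SkinThickness, Insulin, BMI),
                ~ ifelse(.x == 0, median(.x[.x != 0]), .x)))
pima_cleaned_median$Outcome <- factor(pima_cleaned_median$Outcome, levels = c(1, 0), labels = c("Positive", "Negative"))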
set.seed(1)
training_partition_na <- createDataPartition(y = pima_cleaned_na$Outcome, p = 0.6, list = FALSE)
training_pima_new_na <- pima_cleaned_na[training_partition_na, ]
## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
validation_pima_new_na <- pima_cleaned_na[-training_partition_na, ]
prop.table(table(pima_cleaned_na$Outcome)) * 100
##
## Positive Negative
## 34.89583 65.10417
prop.table(table(training_pima_new_na$Outcome)) *100
##
## Positive Negative
## 34.92408 65.07592
prop.table(table(validation_pima_new_na$Outcome)) *100
##
## Positive Negative
## 34.85342 65.14658
predictors_na <- training_pima_new_na[, 1:8]
response_na <- training_pima_new_na$Outcome
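#The fitted naive Bayes model and the confusion matrix that the question asks for are not shown
#above. A minimal sketch of the remaining steps, assuming e1071::naiveBayes() and
#caret::confusionMatrix() as loaded above (the names nb_model_na / nb_predictions_na are
#illustrative):
nb_model_na <- naiveBayes(x = predictors_na, y = response_na)
nb_predictions_na <- predict(nb_model_na, newdata = validation_pima_new_na[, 1:8])
confusionMatrix(nb_predictions_na, validation_pima_new_na$Outcome, positive = "Positive")
#The "Pos Pred Value" entry of this confusion matrix is compared against the same entry from the
#median-imputed model's confusion matrix to answer the question below.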
The median-imputed model provides the better positive predictive value.