#loading the dataset for our prediction
## [1] "X" "Species" "Owner"
## [4] "Country.of.Origin" "Farm.Name" "Lot.Number"
## [7] "Mill" "ICO.Number" "Company"
## [10] "Altitude" "Region" "Producer"
## [13] "Number.of.Bags" "Bag.Weight" "In.Country.Partner"
## [16] "Harvest.Year" "Grading.Date" "Owner.1"
## [19] "Variety" "Processing.Method" "Aroma"
## [22] "Flavor" "Aftertaste" "Acidity"
## [25] "Body" "Balance" "Uniformity"
## [28] "Clean.Cup" "Sweetness" "Cupper.Points"
## [31] "Total.Cup.Points" "Moisture" "Category.One.Defects"
## [34] "Quakers" "Color" "Category.Two.Defects"
## [37] "Expiration" "Certification.Body" "Certification.Address"
## [40] "Certification.Contact" "unit_of_measurement" "altitude_low_meters"
## [43] "altitude_high_meters" "altitude_mean_meters"
This is a very large data set. Many things can be done with it, but today we will just predict the coffee's total cup points using linear regression.
Our target variable has no missing data. Some features, such as altitude_low_meters and altitude_high_meters, have missing values. If we remove the NAs and find that the correlation between these features and the total cup points is decent, we can impute them; otherwise it is not necessary.
library(DataExplorer)

# Keep only the rows where both candidate features are observed, so that their
# correlation with the target can be inspected before deciding on imputation.
coffee_gap <- coffee %>%
  filter(!is.na(altitude_mean_meters), !is.na(Quakers))

# Correlation plot of the target against the two candidate features.
coffee_gap %>%
  select(Total.Cup.Points, altitude_mean_meters, Quakers) %>%
  plot_correlation(
    ggtheme = theme_light(),
    title = "correlation between Total.Cup.Points vs altitude_mean_meters"
  )

# As the correlation is very weak, we can remove the features that contain NA
# values.
# Modelling data: drop the altitude columns and Quakers (they contain missing
# values) together with X.
# NOTE(review): X is presumably a row index from the original file - confirm.
coffee_new <- select(
  coffee,
  -c(
    altitude_high_meters,
    altitude_low_meters,
    altitude_mean_meters,
    X,
    Quakers
  )
)
# Define the regression task: predict Total.Cup.Points from the remaining
# columns of coffee_new.
coffee.task <- makeRegrTask(
  data = coffee_new,
  target = "Total.Cup.Points"
)
coffee.task
## Supervised task: coffee_new
## Type: regr
## Target: Total.Cup.Points
## Observations: 1311
## Features:
## numerics factors ordered functionals
## 14 0 0 0
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
# Define the learner: a linear regression model ("regr.lm").
coffee.learner <- makeLearner("regr.lm")

# Split the task into a training and a test subset with one holdout split.
ho <- makeResampleInstance(desc = "Holdout", task = coffee.task)
coffee.train <- subsetTask(coffee.task, subset = ho$train.inds[[1]])
coffee.test <- subsetTask(coffee.task, subset = ho$test.inds[[1]])
coffee.train
## Supervised task: coffee_new
## Type: regr
## Target: Total.Cup.Points
## Observations: 874
## Features:
## numerics factors ordered functionals
## 14 0 0 0
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
## Supervised task: coffee_new
## Type: regr
## Target: Total.Cup.Points
## Observations: 437
## Features:
## numerics factors ordered functionals
## 14 0 0 0
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
# Illustration of what the holdout split above amounts to: the same kind of
# 2/3 vs 1/3 partition, done directly on the data frame with caTools.
library(caTools)
# NOTE(review): the variable name `sample` reuses the name of base::sample();
# it is kept unchanged in case later code refers to it.
sample <- sample.split(coffee_new$Total.Cup.Points, SplitRatio = 2 / 3)
# `sample` is used as a logical row mask directly rather than compared against
# T/F, which are ordinary variables that can be reassigned.
train <- subset(coffee_new, sample)
test <- subset(coffee_new, !sample)

# automating feature selection
library(FSelectorRcpp)
# Score every feature of the training task with the "linear.correlation"
# filter and show the resulting ranking.
filtervals <- generateFilterValuesData(
  task = coffee.train,
  method = "linear.correlation"
)
filtervals
## FilterValues:
## Task: coffee_new
## name type filter value
## 1: Flavor numeric linear.correlation 0.88699234
## 2: Aftertaste numeric linear.correlation 0.87690312
## 3: Balance numeric linear.correlation 0.84740461
## 4: Aroma numeric linear.correlation 0.82166257
## 5: Body numeric linear.correlation 0.82089294
## 6: Acidity numeric linear.correlation 0.81329005
## 7: Cupper.Points numeric linear.correlation 0.79626669
## 8: Uniformity numeric linear.correlation 0.69345325
## 9: Sweetness numeric linear.correlation 0.67718583
## 10: Clean.Cup numeric linear.correlation 0.67692154
## 11: Category.Two.Defects integer linear.correlation 0.21970485
## 12: Moisture numeric linear.correlation 0.08766368
## 13: Category.One.Defects integer linear.correlation 0.07376721
## 14: Number.of.Bags integer linear.correlation 0.02017343
This shows that, by using linear correlation, we can identify which features to use for our model.
# Build a filter wrapper around the learner so that feature filtering can be
# used in hyperparameter tuning; the wrapper can itself be used as a learner.
filterwrapper <- makeFilterWrapper(
  learner = coffee.learner,
  fw.method = "linear.correlation"
)
# List the parameters that the wrapped learner exposes.
getParamSet(filterwrapper)
## Type len Def Constr
## fw.method discrete - - anova.test,auc,carscore,cforest.impor...
## fw.base.methods discrete - - anova.test,auc,carscore,cforest.impor...
## fw.perc numeric - - 0 to 1
## fw.abs integer - - 0 to Inf
## fw.threshold numeric - - -Inf to Inf
## fw.fun function - - -
## fw.fun.args untyped - <NULL> -
## fw.mandatory.feat untyped - - -
## tol numeric - 1e-07 0 to Inf
## singular.ok logical - TRUE -
## Req Tunable Trafo
## fw.method - TRUE -
## fw.base.methods - TRUE -
## fw.perc - TRUE -
## fw.abs - TRUE -
## fw.threshold - TRUE -
## fw.fun - TRUE -
## fw.fun.args - TRUE -
## fw.mandatory.feat - TRUE -
## tol - TRUE -
## singular.ok - FALSE -
# Hyperparameter tuning of the filter wrapper.
# Search space: fw.abs is the absolute number of top-ranked features to keep.
# It ranges from 2 up to the number of features in the training task, because
# asking for more features than the task has cannot select anything new.
ps <- makeParamSet(
  makeIntegerParam("fw.abs", lower = 2, upper = getTaskNFeats(coffee.train))
)
# Search control: grid search over the candidate values of fw.abs.
sc <- makeTuneControlGrid()
# Resampling used to score each candidate: 10-fold cross-validation.
kfold <- makeResampleDesc("CV", iters = 10)
# Tune on the training task only, using RMSE as the performance measure.
tune <- tuneParams(
  learner = filterwrapper,
  task = coffee.train,
  resampling = kfold,
  measures = rmse,
  par.set = ps,
  control = sc
)
tune
## Tune result:
## Op. pars: fw.abs=10
## rmse.test.rmse=0.0193320
# New task that keeps only the top-ranked features of the training task, using
# the filter scores in `filtervals` and the tuned number of features (fw.abs).
coffee.filter.feature <- filterFeatures(
  coffee.train,
  fval = filtervals,
  abs = tune$x$fw.abs
)
# Hurray! We got an RMSE of 0.0193997, which is decent.
##
## Call:
## stats::lm(formula = f, data = d)
##
## Coefficients:
## (Intercept) Aroma Flavor Aftertaste Acidity
## -0.01636 1.00127 0.99885 1.00415 0.99697
## Body Balance Uniformity Clean.Cup Sweetness
## 1.00062 1.00230 1.00397 1.00027 0.99786
## Cupper.Points
## 0.99536
## id truth response
## 1118 1 80.08 80.08173
## 1181 2 79.33 79.32910
## 648 3 82.58 82.58369
## 558 4 82.92 82.90909
## 65 5 85.58 85.58115
## 680 6 82.42 82.39724
## 187 7 84.33 84.33038
## 771 8 82.17 82.17195
## 827 9 81.92 81.91054
## 972 10 81.17 81.17100
## 281 11 83.83 83.84000
## 44 12 86.08 86.07702
## 405 13 83.33 83.35043
## 199 14 84.25 84.25183
## 289 15 83.83 83.82956
## 813 16 82.00 82.00047
## 520 17 83.00 82.98697
## 665 18 82.50 82.50128
## 1125 19 80.00 80.00534
## 352 20 83.50 83.49072
## 584 21 82.75 82.74014
## 603 22 82.75 82.75038
## 336 23 83.58 83.57947
## 697 24 82.42 82.41033
## 896 25 81.58 81.58012
## 653 26 82.58 82.58014
## 27 27 86.92 86.91954
## 591 28 82.75 82.75128
## 488 29 83.08 83.08079
## 435 30 83.25 83.24087
## 677 31 82.42 82.41926
## 357 32 83.50 83.49062
## 421 33 83.25 83.27159
## 989 34 81.08 81.07247
## 1210 35 78.75 78.75281
## 95 36 85.08 85.08012
## 1037 37 80.83 80.82214
## 29 38 86.67 86.66024
## 902 39 81.58 81.57682
## 1061 40 80.58 80.56895
## 958 41 81.33 81.32089
## 1117 42 80.08 80.09071
## 1079 43 80.50 80.50049
## 505 44 83.00 82.99924
## 1302 45 70.75 70.74507
## 1045 46 80.75 80.74104
## 1273 47 75.83 75.83004
## 1088 48 80.33 80.34039
## 84 49 85.25 85.25030
## 1128 50 80.00 79.98836
## 492 51 83.08 83.08055
## 343 52 83.58 83.59982
## 42 53 86.17 86.17146
## 860 54 81.75 81.75068
## 842 55 81.83 81.83030
## 309 56 83.75 83.75214
## 797 57 82.00 82.00071
## 786 58 82.08 82.07242
## 177 59 84.42 84.41962
## 540 60 82.92 82.91161
## 551 61 82.92 82.91978
## 189 62 84.33 84.33992
## 380 63 83.42 83.40729
## 1255 64 77.33 77.33118
## 431 65 83.25 83.23953
## 669 66 82.50 82.50038
## 294 67 83.83 83.82060
## 147 68 84.58 84.57954
## 1150 69 79.75 79.74066
## 598 70 82.75 82.73823
## 738 71 82.25 82.26113
## 961 72 81.33 81.33090
## 1289 73 74.33 74.33129
## 463 74 83.17 83.15067
## 924 75 81.50 81.50114
## 115 76 84.92 84.92138
## 744 77 82.25 82.26119
## 37 78 86.25 86.24907
## 544 79 82.92 82.90978
## 183 80 84.33 84.32963
## 1152 81 79.75 79.74066
## 313 82 83.75 83.75216
## 7 83 88.75 88.74912
## 1026 84 80.92 80.92147
## 361 85 83.50 83.51080
## 1283 86 74.83 74.82900
## 923 87 81.50 81.50082
## 997 88 81.08 81.09081
## 137 89 84.67 84.66007
## 910 90 81.58 81.58274
## 318 91 83.67 83.67164
## 456 92 83.17 83.16995
## 572 93 82.83 82.83142
## 194 94 84.25 84.24198
## 1066 95 80.58 80.58841
## 71 96 85.42 85.40874
## 867 97 81.75 81.74995
## 396 98 83.38 83.38607
## 728 99 82.33 82.32139
## 1233 100 78.25 78.24549
## 973 101 81.17 81.18164
## 700 102 82.42 82.41045
## 861 103 81.75 81.73757
## 727 104 82.33 82.33053
## 378 105 83.42 83.43014
## 708 106 82.33 82.34098
## 1196 107 79.08 79.07753
## 516 108 83.00 83.00234
## 207 109 84.25 84.25019
## 1074 110 80.50 80.50061
## 1161 111 79.67 79.66150
## 691 112 82.42 82.43065
## 625 113 82.67 82.67927
## 310 114 83.75 83.74081
## 767 115 82.17 82.17184
## 1080 116 80.42 80.42014
## 967 117 81.25 81.25074
## 311 118 83.75 83.75064
## 238 119 84.08 84.06794
## 145 120 84.58 84.58148
## 903 121 81.58 81.58115
## 546 122 82.92 82.91169
## 646 123 82.58 82.58060
## 2 124 89.92 89.91993
## 609 125 82.67 82.67034
## 414 126 83.33 83.34090
## 853 127 81.83 81.81924
## 783 128 82.08 82.09169
## 622 129 82.67 82.67057
## 499 130 83.00 83.00025
## 636 131 82.58 82.58047
## 178 132 84.42 84.42055
## 135 133 84.67 84.66242
## 220 134 84.17 84.17063
## 658 135 82.50 82.48551
## 952 136 81.33 81.33045
## 21 137 87.25 87.24879
## 1281 138 75.00 75.01064
## 695 139 82.42 82.40999
## 401 140 83.33 83.33046
## 555 141 82.92 82.91932
## 893 142 81.67 81.66950
## 926 143 81.50 81.51309
## 1068 144 80.58 80.56895
## 188 145 84.33 84.33988
## 995 146 81.08 81.08235
## 705 147 82.42 82.41275
## 587 148 82.75 82.75995
## 994 149 81.08 81.08128
## 314 150 83.75 83.75077
## 539 151 82.92 82.91246
## 17 152 87.42 87.41082
## 1078 153 80.50 80.51104
## 416 154 83.33 83.33197
## 703 155 82.42 82.43155
## 880 156 81.67 81.66143
## 1191 157 79.17 79.18158
## 1155 158 79.75 79.74131
## 527 159 83.00 83.00160
## 536 160 82.92 82.91147
## 523 161 83.00 82.98859
## 108 162 84.92 84.91110
## 239 163 84.08 84.09016
## 820 164 81.92 81.91316
## 1087 165 80.33 80.31959
## 668 166 82.50 82.49974
## 808 167 82.00 82.01031
## 45 168 86.08 86.07098
## 889 169 81.67 81.66178
## 1148 170 79.75 79.74962
## 89 171 85.08 85.07944
## 794 172 82.08 82.09422
## 35 173 86.25 86.24033
## 604 174 82.75 82.75153
## 510 175 83.00 83.00377
## 1038 176 80.75 80.75145
## 1200 177 79.00 79.00137
## 116 178 84.83 84.82059
## 1126 179 80.00 80.00259
## 748 180 82.25 82.25069
## 859 181 81.83 81.82025
## 14 182 87.92 87.90787
## 826 183 81.92 81.92272
## 519 184 83.00 83.00084
## 419 185 83.25 83.26031
## 1215 186 78.58 78.58119
## 1000 187 81.00 80.99147
## 917 188 81.50 81.48196
## 1033 189 80.83 80.83065
## 1260 190 77.00 77.00501
## 1039 191 80.75 80.74864
## 1136 192 79.92 79.89846
## 772 193 82.17 82.16008
## 871 194 81.67 81.67058
## 1298 195 72.33 72.34509
## 928 196 81.50 81.51108
## 432 197 83.25 83.24084
## 751 198 82.25 82.25024
## 1103 199 80.25 80.25256
## 450 200 83.17 83.17061
## 275 201 83.83 83.82985
## 198 202 84.25 84.26097
## 888 203 81.67 81.65911
## 308 204 83.75 83.75278
## 87 205 85.17 85.15906
## 756 206 82.25 82.25137
## 877 207 81.67 81.67026
## 673 208 82.50 82.49952
## 1108 209 80.17 80.15959
## 650 210 82.58 82.58062
## 258 211 84.00 83.98883
## 229 212 84.17 84.16185
## 1129 213 80.00 79.98936
## 1258 214 77.25 77.24888
## 54 215 85.92 85.91966
## 174 216 84.42 84.43167
## 63 217 85.58 85.59029
## 922 218 81.50 81.50078
## 5 219 88.83 88.82849
## 1145 220 79.75 79.73940
## 939 221 81.42 81.41265
## 1202 222 79.00 78.99913
## 208 223 84.25 84.24092
## 639 224 82.58 82.58159
## 85 225 85.25 85.25021
## 1005 226 81.00 81.00098
## 81 227 85.33 85.33052
## 734 228 82.33 82.32216
## 1093 229 80.33 80.32782
## 594 230 82.75 82.75013
## 974 231 81.17 81.16106
## 553 232 82.92 82.92145
## 283 233 83.83 83.83197
## 511 234 83.00 83.01019
## 718 235 82.33 82.32866
## 840 236 81.83 81.83778
## 561 237 82.83 82.81968
## 694 238 82.42 82.42160
## 1237 239 78.00 78.00093
## 1063 240 80.58 80.57055
## 348 241 83.58 83.59052
## 1206 242 78.92 78.92197
## 271 243 83.92 83.92178
## 500 244 83.00 83.00100
## 1308 245 67.92 67.92691
## 1194 246 79.08 79.07999
## 4 247 89.00 89.00869
## 1272 248 76.00 75.98708
## 720 249 82.33 82.33388
## 184 250 84.33 84.33057
## 1055 251 80.67 80.68135
## 221 252 84.17 84.15974
## 131 253 84.67 84.67260
## 276 254 83.83 83.83096
## 320 255 83.67 83.66840
## 789 256 82.08 82.09074
## 683 257 82.42 82.43110
## 732 258 82.33 82.31722
## 562 259 82.83 82.83103
## 134 260 84.67 84.66070
## 864 261 81.75 81.74994
## 153 262 84.58 84.58001
## 693 263 82.42 82.43088
## 344 264 83.58 83.60115
## 214 265 84.17 84.16047
## 470 266 83.17 83.17129
## 764 267 82.17 82.17121
## 392 268 83.42 83.42107
## 1245 269 77.92 77.91438
## 442 270 83.17 83.15947
## 280 271 83.83 83.82747
## 946 272 81.42 81.42048
## 464 273 83.17 83.17014
## 1040 274 80.75 80.76006
## 225 275 84.17 84.16098
## 640 276 82.58 82.59075
## 1284 277 74.75 74.74293
## 940 278 81.42 81.42108
## 1187 279 79.17 79.17047
## 43 280 86.17 86.18144
## 1209 281 78.75 78.73932
## 721 282 82.33 82.33192
## 80 283 85.33 85.32905
## 512 284 83.00 83.00022
## 971 285 81.25 81.24809
## 560 286 82.83 82.83078
## 798 287 82.00 82.01122
## 1182 288 79.33 79.32895
## 1173 289 79.50 79.48839
## 235 290 84.08 84.09110
## 920 291 81.50 81.50383
## 1231 292 78.33 78.32941
## 159 293 84.50 84.51043
## 624 294 82.67 82.67444
## 33 295 86.42 86.42209
## 550 296 82.92 82.90265
## 125 297 84.75 84.73670
## 1217 298 78.58 78.58380
## 82 299 85.33 85.31938
## 1106 300 80.17 80.16547
## 1070 301 80.50 80.50071
## 793 302 82.08 82.07084
## 6 303 88.83 88.83076
## 868 304 81.75 81.75102
## 99 305 85.00 84.99911
## 247 306 84.00 83.98898
## 158 307 84.58 84.58183
## 991 308 81.08 81.08226
## 916 309 81.50 81.49974
## 674 310 82.50 82.48991
## 72 311 85.42 85.41090
## 873 312 81.67 81.66984
## 1012 313 81.00 80.99114
## 428 314 83.25 83.26007
## 491 315 83.08 83.07662
## 119 316 84.83 84.83929
## 122 317 84.75 84.74931
## 410 318 83.33 83.35071
## 755 319 82.25 82.25262
## 215 320 84.17 84.17973
## 501 321 83.00 83.01007
## 1228 322 78.33 78.35282
## 753 323 82.25 82.25247
## 462 324 83.17 83.17199
## 1167 325 79.58 79.59098
## 12 326 87.92 87.91855
## 618 327 82.67 82.66422
## 1107 328 80.17 80.16038
## 243 329 84.00 83.99960
## 525 330 83.00 83.00197
## 230 331 84.17 84.16021
## 1212 332 78.75 78.75125
## 1268 333 76.17 76.15570
## 210 334 84.25 84.25111
## 475 335 83.17 83.16156
## 1147 336 79.75 79.75292
## 554 337 82.92 82.90029
## 829 338 81.92 81.90932
## 149 339 84.58 84.57989
## 984 340 81.17 81.17219
## 1102 341 80.25 80.24844
## 497 342 83.00 83.00058
## 676 343 82.42 82.41004
## 865 344 81.75 81.76002
## 270 345 83.92 83.93300
## 730 346 82.33 82.33013
## 1101 347 80.25 80.24132
## 1282 348 74.92 74.91649
## 1270 349 76.17 76.15161
## 1064 350 80.58 80.58245
## 856 351 81.83 81.83045
## 481 352 83.08 83.07139
## 1131 353 79.92 79.90751
## 30 354 86.58 86.58974
## 745 355 82.25 82.26068
## 1177 356 79.42 79.41098
## 20 357 87.25 87.24968
## 69 358 85.50 85.50952
## 886 359 81.67 81.67242
## 397 360 83.33 83.32211
## 834 361 81.83 81.84007
## 302 362 83.75 83.75039
## 534 363 82.92 82.91106
## 359 364 83.50 83.49928
## 892 365 81.67 81.68206
## 273 366 83.92 83.90947
## 996 367 81.08 81.08075
## 50 368 86.00 86.00045
## 1195 369 79.08 79.06922
## 1201 370 79.00 78.99934
## 15 371 87.83 87.82068
## 212 372 84.17 84.16083
## 541 373 82.92 82.92308
## 16 374 87.58 87.59153
## 900 375 81.58 81.59138
## 455 376 83.17 83.17085
## 566 377 82.83 82.83003
## 219 378 84.17 84.17120
## 433 379 83.25 83.24908
## 305 380 83.75 83.77019
## 226 381 84.17 84.17184
## 337 382 83.58 83.57003
## 1256 383 77.33 77.32813
## 124 384 84.75 84.75178
## 663 385 82.50 82.51259
## 461 386 83.17 83.16664
## 1002 387 81.00 81.01019
## 863 388 81.75 81.76323
## 945 389 81.42 81.42233
## 486 390 83.08 83.08167
## 1293 391 73.50 73.49214
## 151 392 84.58 84.58155
## 521 393 83.00 83.00038
## 1193 394 79.08 79.10785
## 251 395 84.00 84.00046
## 60 396 85.75 85.74941
## 882 397 81.67 81.66968
## 1286 398 74.67 74.64156
## 1208 399 78.75 78.75013
## 364 400 83.50 83.49083
## 96 401 85.08 85.09165
## 1169 402 79.58 79.56955
## 670 403 82.50 82.49211
## 919 404 81.50 81.51080
## 1071 405 80.50 80.49160
## 966 406 81.25 81.25973
## 1133 407 79.92 79.90823
## 260 408 83.92 83.90865
## 684 409 82.42 82.41997
## 1224 410 78.50 78.49764
## 91 411 85.08 85.08575
## 702 412 82.42 82.42077
## 600 413 82.75 82.74077
## 1090 414 80.33 80.33171
## 831 415 81.92 81.90786
## 1287 416 74.42 74.41926
## 237 417 84.08 84.08990
## 992 418 81.08 81.09227
## 1276 419 75.58 75.58531
## 150 420 84.58 84.58908
## 621 421 82.67 82.67135
## 1301 422 71.00 71.00091
## 8 423 88.67 88.65978
## 213 424 84.17 84.17964
## 785 425 82.08 82.10222
## 307 426 83.75 83.76090
## 1229 427 78.33 78.34073
## 774 428 82.17 82.17165
## 408 429 83.33 83.31838
## 1247 430 77.83 77.83763
## 493 431 83.08 83.06824
## 765 432 82.17 82.18144
## 522 433 83.00 83.00141
## 447 434 83.17 83.18103
## 1153 435 79.75 79.75137
## 473 436 83.17 83.17049
## 1251 437 77.58 77.58595
Our predictions are very close to the true values; the model did pretty well.