Prediction with Linear Regression

Omon Das

2023-09-13

library(tidyverse)
library(mlr)
library(imputeTS)
library(prettydoc)

#loading the dataset for our prediction

coffee=read.csv("D:\\wallpapers and photos\\csv\\coffee_dataset.csv")
names(coffee)
##  [1] "X"                     "Species"               "Owner"                
##  [4] "Country.of.Origin"     "Farm.Name"             "Lot.Number"           
##  [7] "Mill"                  "ICO.Number"            "Company"              
## [10] "Altitude"              "Region"                "Producer"             
## [13] "Number.of.Bags"        "Bag.Weight"            "In.Country.Partner"   
## [16] "Harvest.Year"          "Grading.Date"          "Owner.1"              
## [19] "Variety"               "Processing.Method"     "Aroma"                
## [22] "Flavor"                "Aftertaste"            "Acidity"              
## [25] "Body"                  "Balance"               "Uniformity"           
## [28] "Clean.Cup"             "Sweetness"             "Cupper.Points"        
## [31] "Total.Cup.Points"      "Moisture"              "Category.One.Defects" 
## [34] "Quakers"               "Color"                 "Category.Two.Defects" 
## [37] "Expiration"            "Certification.Body"    "Certification.Address"
## [40] "Certification.Contact" "unit_of_measurement"   "altitude_low_meters"  
## [43] "altitude_high_meters"  "altitude_mean_meters"

This is a very large dataset, and many things could be done with it, but today we will just predict the coffee Total.Cup.Points using linear regression.

library(naniar)


coffee=coffee %>% 
  select_if(is.numeric)


vis_miss(coffee)

Our target variable has no missing data. Some features, such as altitude_low_meters and altitude_high_meters, do have missing values. If we remove the NAs and the correlation between those features and Total.Cup.Points turns out to be decent, we can impute them; otherwise imputation is not necessary.

library(DataExplorer)
coffee_gap=coffee %>% 
  filter(!is.na(altitude_mean_meters),!is.na(Quakers))
coffee_gap %>% 
  select(Total.Cup.Points,altitude_mean_meters,Quakers) %>% 
  plot_correlation(ggtheme = theme_light(),title = "Correlation of Total.Cup.Points with altitude_mean_meters and Quakers")
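
As a quick numeric check alongside the plot (a small addition of mine, not part of the original pipeline), base R's cor() prints the same correlations directly:

#numeric check of the correlations shown in the plot
cor(coffee_gap$Total.Cup.Points, coffee_gap$altitude_mean_meters)
cor(coffee_gap$Total.Cup.Points, coffee_gap$Quakers)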

As the correlation is very weak, we can simply remove the features that contain NA values.
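
Had the correlation been meaningful, we could have imputed instead of dropping. A minimal sketch with mlr's impute() might look like the following; the choice of mean imputation for the altitude columns and median for the integer Quakers column is purely illustrative:

#hypothetical sketch: impute instead of dropping, had we kept the columns
imp = impute(coffee,
             cols = list(altitude_low_meters  = imputeMean(),
                         altitude_high_meters = imputeMean(),
                         altitude_mean_meters = imputeMean(),
                         Quakers              = imputeMedian()))
coffee_imputed = imp$data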

coffee_new=coffee %>% 
  select(-c(altitude_high_meters,altitude_low_meters,altitude_mean_meters,X,Quakers))
#making a task

coffee.task=makeRegrTask(data = coffee_new, target ="Total.Cup.Points")

coffee.task
## Supervised task: coffee_new
## Type: regr
## Target: Total.Cup.Points
## Observations: 1311
## Features:
##    numerics     factors     ordered functionals 
##          14           0           0           0 
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
#making a learner

coffee.learner=makeLearner("regr.lm")

#splitting the dataset
ho = makeResampleInstance("Holdout",coffee.task)
coffee.train = subsetTask(coffee.task,ho$train.inds[[1]])
coffee.test = subsetTask(coffee.task,ho$test.inds[[1]])


coffee.train
## Supervised task: coffee_new
## Type: regr
## Target: Total.Cup.Points
## Observations: 874
## Features:
##    numerics     factors     ordered functionals 
##          14           0           0           0 
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
coffee.test
## Supervised task: coffee_new
## Type: regr
## Target: Total.Cup.Points
## Observations: 437
## Features:
##    numerics     factors     ordered functionals 
##          14           0           0           0 
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
#to see what the split data actually looks like, we can reproduce an equivalent 2/3 holdout split on the raw data frame with caTools


library(caTools)


sample=sample.split(coffee_new$Total.Cup.Points,SplitRatio = 2/3)
train=subset(coffee_new,sample==T)
test=subset(coffee_new,sample==F)
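
As a sanity check (a sketch I'm adding, using the caTools split above rather than the mlr task), the same model can be fit with plain base R:

#sketch: plain lm() fit on the caTools split, with holdout RMSE
base_fit = lm(Total.Cup.Points ~ ., data = train)
base_pred = predict(base_fit, newdata = test)
sqrt(mean((test$Total.Cup.Points - base_pred)^2))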
#automating feature selection
library(FSelectorRcpp)


filtervals=generateFilterValuesData(coffee.train,method =  "linear.correlation")

filtervals
## FilterValues:
## Task: coffee_new
##                     name    type             filter      value
##  1:               Flavor numeric linear.correlation 0.88699234
##  2:           Aftertaste numeric linear.correlation 0.87690312
##  3:              Balance numeric linear.correlation 0.84740461
##  4:                Aroma numeric linear.correlation 0.82166257
##  5:                 Body numeric linear.correlation 0.82089294
##  6:              Acidity numeric linear.correlation 0.81329005
##  7:        Cupper.Points numeric linear.correlation 0.79626669
##  8:           Uniformity numeric linear.correlation 0.69345325
##  9:            Sweetness numeric linear.correlation 0.67718583
## 10:            Clean.Cup numeric linear.correlation 0.67692154
## 11: Category.Two.Defects integer linear.correlation 0.21970485
## 12:             Moisture numeric linear.correlation 0.08766368
## 13: Category.One.Defects integer linear.correlation 0.07376721
## 14:       Number.of.Bags integer linear.correlation 0.02017343
plotFilterValues(filtervals)

This shows that, using linear correlation, we can identify which features to use in our model.

#making a filter wrapper to use in our hyperparameter tuning; the wrapper also works as a new learner

filterwrapper=makeFilterWrapper(learner = coffee.learner, fw.method = "linear.correlation")


getParamSet(filterwrapper)
##                       Type len    Def                                   Constr
## fw.method         discrete   -      - anova.test,auc,carscore,cforest.impor...
## fw.base.methods   discrete   -      - anova.test,auc,carscore,cforest.impor...
## fw.perc            numeric   -      -                                   0 to 1
## fw.abs             integer   -      -                                 0 to Inf
## fw.threshold       numeric   -      -                              -Inf to Inf
## fw.fun            function   -      -                                        -
## fw.fun.args        untyped   - <NULL>                                        -
## fw.mandatory.feat  untyped   -      -                                        -
## tol                numeric   -  1e-07                                 0 to Inf
## singular.ok        logical   -   TRUE                                        -
##                   Req Tunable Trafo
## fw.method           -    TRUE     -
## fw.base.methods     -    TRUE     -
## fw.perc             -    TRUE     -
## fw.abs              -    TRUE     -
## fw.threshold        -    TRUE     -
## fw.fun              -    TRUE     -
## fw.fun.args         -    TRUE     -
## fw.mandatory.feat   -    TRUE     -
## tol                 -    TRUE     -
## singular.ok         -   FALSE     -
#hyperparameter tuning the model


#parameter set: fw.abs is the absolute number of top-ranked features to keep, searched from a minimum of 2 to a maximum of 20

ps=makeParamSet(makeIntegerParam("fw.abs",2,20))


#search control

#we will use grid search to find the best possible answer

sc=makeTuneControlGrid()


kfold=makeResampleDesc("CV", iters=10)
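
Before tuning, it can help to record a baseline (an optional sketch, not part of the original pipeline): cross-validate the untuned learner on all 14 features with the same resampling scheme.

#optional baseline sketch: 10-fold CV of the plain lm learner on all features
baseline = resample(coffee.learner, coffee.train, resampling = kfold, measures = rmse)
baseline$aggr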

#tuning the parameters



tune=tuneParams(filterwrapper,coffee.train, par.set=ps,control=sc,resampling=kfold,rmse)

tune
## Tune result:
## Op. pars: fw.abs=10
## rmse.test.rmse=0.0193320
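To see how RMSE varied across the grid (a sketch using mlr's hyperparameter-effect helpers; plot.type = "line" is my choice):

#sketch: visualise the grid-search results
effect = generateHyperParsEffectData(tune)
plotHyperParsEffect(effect, x = "fw.abs", y = "rmse.test.rmse", plot.type = "line")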
#new task with the filtered data

coffee.filter.feature=filterFeatures(coffee.train,fval=filtervals,abs = unlist(tune$x))
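
To confirm which features survived the filter (a quick check I'm adding):

#list the features retained after filtering
getTaskFeatureNames(coffee.filter.feature)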

Hurray!! We got an RMSE of 0.0193320, which is excellent for scores that range from about 68 to 90.

train.model=train(coffee.learner,coffee.filter.feature)
getLearnerModel(train.model)
## 
## Call:
## stats::lm(formula = f, data = d)
## 
## Coefficients:
##   (Intercept)          Aroma         Flavor     Aftertaste        Acidity  
##      -0.01636        1.00127        0.99885        1.00415        0.99697  
##          Body        Balance     Uniformity      Clean.Cup      Sweetness  
##       1.00062        1.00230        1.00397        1.00027        0.99786  
## Cupper.Points  
##       0.99536
pred=predict(train.model,coffee.test)
head(as.data.frame(pred))
##      id truth response
## 1118  1 80.08 80.08173
## 1181  2 79.33 79.32910
## 648   3 82.58 82.58369
## 558   4 82.92 82.90909
## 65    5 85.58 85.58115
## 680   6 82.42 82.39724

Our predictions are extremely close to the true values, so the model did very well. This is no surprise: Total.Cup.Points is essentially the sum of the ten component scores the filter selected, which is why every fitted coefficient is close to 1 and the intercept is close to 0.
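
As a final numeric check (a small sketch), mlr's performance() reports the test-set error directly:

#sketch: test-set RMSE of the final model
performance(pred, measures = rmse)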