library(rattle)
## Rattle: A free graphical interface for data mining with R.
## Version 4.1.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
# Use rattle's bundled `weather` data frame as the working data set and print
# a compact overview of its columns.
data <- weather
str(object = data)
## 'data.frame': 366 obs. of 24 variables:
## $ Date : Date, format: "2007-11-01" "2007-11-02" ...
## $ Location : Factor w/ 49 levels "Adelaide","Albany",..: 10 10 10 10 10 10 10 10 10 10 ...
## $ MinTemp : num 8 14 13.7 13.3 7.6 6.2 6.1 8.3 8.8 8.4 ...
## $ MaxTemp : num 24.3 26.9 23.4 15.5 16.1 16.9 18.2 17 19.5 22.8 ...
## $ Rainfall : num 0 3.6 3.6 39.8 2.8 0 0.2 0 0 16.2 ...
## $ Evaporation : num 3.4 4.4 5.8 7.2 5.6 5.8 4.2 5.6 4 5.4 ...
## $ Sunshine : num 6.3 9.7 3.3 9.1 10.6 8.2 8.4 4.6 4.1 7.7 ...
## $ WindGustDir : Ord.factor w/ 16 levels "N"<"NNE"<"NE"<..: 15 4 15 15 8 7 7 5 9 5 ...
## $ WindGustSpeed: num 30 39 85 54 50 44 43 41 48 31 ...
## $ WindDir9am : Ord.factor w/ 16 levels "N"<"NNE"<"NE"<..: 11 5 1 14 8 7 7 7 5 9 ...
## $ WindDir3pm : Ord.factor w/ 16 levels "N"<"NNE"<"NE"<..: 15 13 2 13 6 5 6 5 4 6 ...
## $ WindSpeed9am : num 6 4 6 30 20 20 19 11 19 7 ...
## $ WindSpeed3pm : num 20 17 6 24 28 24 26 24 17 6 ...
## $ Humidity9am : int 68 80 82 62 68 70 63 65 70 82 ...
## $ Humidity3pm : int 29 36 69 56 49 57 47 57 48 32 ...
## $ Pressure9am : num 1020 1012 1010 1006 1018 ...
## $ Pressure3pm : num 1015 1008 1007 1007 1018 ...
## $ Cloud9am : int 7 5 8 2 7 7 4 6 7 7 ...
## $ Cloud3pm : int 7 3 7 7 7 5 6 7 7 1 ...
## $ Temp9am : num 14.4 17.5 15.4 13.5 11.1 10.9 12.4 12.1 14.1 13.3 ...
## $ Temp3pm : num 23.6 25.7 20.2 14.1 15.4 14.8 17.3 15.5 18.9 21.7 ...
## $ RainToday : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 1 1 1 1 2 ...
## $ RISK_MM : num 3.6 3.6 39.8 2.8 0 0.2 0 0 16.2 0 ...
## $ RainTomorrow : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 1 1 1 2 1 ...
# Train/test split: a random 75% of the rows become `train`, the remaining
# rows become `test`. sample()'s defaults (replace = FALSE, prob = NULL) are
# spelled out explicitly below.
smp_size <- floor(nrow(data) * 0.75)
# Fix the RNG seed so the partition is reproducible.
set.seed(seed = 123)
train_ind <- sample(
  x = seq_len(nrow(data)),
  size = smp_size,
  replace = FALSE,
  prob = NULL
)
train <- data[train_ind, ]
test <- data[-train_ind, ]
# Visual check of both partitions. View() opens a spreadsheet-style viewer, so
# it is only called in an interactive session; batch runs (Rscript, knitr)
# skip it instead of failing or opening windows.
if (interactive()) {
  View(train)
  View(test)
}
# NOTE: the former attach(train) call was dropped. Every later statement reaches
# the columns through `train$...` or a `data = train` argument, so the attached
# copy was never used; it only left a stale snapshot of the columns on the
# search path.
# Exploratory scatter plots: Cloud9am against each of the two variables that
# are used as predictors in the models below.
plot(x = train$Cloud9am, y = train$Evaporation, pch = 16)

plot(x = train$Cloud9am, y = train$Pressure9am, pch = 16)

# Create a linear regression model of Cloud9am on Evaporation and Pressure9am.
# The formula uses bare column names that are resolved through `data = train`.
# Writing the terms as `train$Cloud9am ~ train$Evaporation + ...` hard-wires the
# fit to the `train` object: predict(model, newdata) would then ignore `newdata`
# and return the training-set predictions for any other data frame (e.g. `test`).
# The fit itself is numerically identical; only the term names change.
model <- lm(Cloud9am ~ Evaporation + Pressure9am, data = train)
summary(model)
##
## Call:
## lm(formula = Cloud9am ~ Evaporation + Pressure9am, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4394 -2.7544 -0.6714 2.8075 5.2829
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 127.70079 29.33022 4.354 1.90e-05 ***
## Evaporation -0.25670 0.07246 -3.543 0.000466 ***
## Pressure9am -0.12032 0.02863 -4.203 3.58e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.869 on 271 degrees of freedom
## Multiple R-squared: 0.0737, Adjusted R-squared: 0.06686
## F-statistic: 10.78 on 2 and 271 DF, p-value: 3.126e-05
model
##
## Call:
## lm(formula = Cloud9am ~ Evaporation + Pressure9am, data = train)
##
## Coefficients:
## (Intercept) Evaporation Pressure9am
## 127.7008 -0.2567 -0.1203
# Set up an observed-vs-predicted plot for the model comparison.
# abline(model) only draws a meaningful line for a one-predictor fit: with two
# predictors it warns and uses just the intercept and the first slope. The
# previous plot(train$Cloud9am) also put the row index on the x-axis, so the
# predictions added later with points(train$Cloud9am, ...) did not share its
# coordinate system. Here the x-axis is the observed Cloud9am and the dashed
# 45-degree line marks where perfect predictions would fall.
plot(train$Cloud9am, train$Cloud9am, pch = 16,
     xlab = "Observed Cloud9am", ylab = "Observed / predicted Cloud9am")
abline(a = 0, b = 1, lty = 2)
# Predicted Cloud9am from the linear model for every row of `train`, then
# echo the full vector to the console.
predictedY <- predict(object = model, newdata = train)
print(predictedY)
## 106 288 149 321 341 17 191 363
## 4.555741 3.836217 4.362417 3.640459 5.004916 3.799275 4.023930 1.996020
## 198 164 362 161 240 203 37 316
## 4.480338 3.889144 3.492052 3.569111 3.162421 3.764835 4.557330 4.347985
## 87 15 115 332 308 354 221 342
## 3.495263 2.931344 4.487503 3.404651 2.618578 2.991538 2.739710 3.972557
## 225 242 185 202 98 50 324 303
## 4.220466 4.060838 3.850648 4.180347 6.408680 5.068321 4.482738 3.186485
## 231 265 9 159 251 72 105 76
## 3.910843 3.928485 3.212949 3.059743 4.421800 3.547379 5.355468 4.336730
## 47 135 325 120 337 45 75 364
## 4.543709 2.812646 4.130563 4.226008 3.915608 4.118531 2.717133 2.336162
## 85 272 349 140 330 39 176 65
## 3.174419 4.403346 3.375753 2.410754 2.838366 4.386447 4.015109 3.839394
## 40 233 276 348 204 29 117 84
## 3.302716 4.675285 5.142913 3.273886 3.054943 3.297139 4.072025 3.292340
## 247 136 244 243 237 131 224 186
## 3.248268 3.030034 5.000116 4.694515 3.627683 2.689925 4.573418 4.032752
## 209 1 139 311 111 178 102 32
## 3.979824 4.137018 3.159987 4.547731 3.824962 5.229537 4.254905 5.296930
## 70 360 119 296 30 123 346 250
## 3.459978 3.783255 3.741515 4.486760 4.643176 2.624155 2.649030 4.084902
## 302 49 352 180 95 275 88 51
## 2.909747 3.571477 2.840732 5.492620 3.881911 6.099057 4.109709 5.365912
## 212 26 126 137 160 89 129 252
## 2.872063 3.264254 2.771750 2.043371 2.908936 3.294706 3.176041 4.768330
## 127 309 238 158 366 38 345 77
## 2.964263 2.519922 3.702275 3.292374 4.684849 4.521233 2.785405 3.687742
## 16 256 182 36 138 260 146 100
## 3.070118 4.187580 4.108966 3.577899 2.454083 4.855766 5.434082 5.079508
## 266 79 327 54 90 249 257 22
## 2.938645 2.810212 4.178691 4.248484 2.616888 3.462445 4.603092 3.731071
## 34 357 147 210 259 172 121 153
## 5.526317 3.332458 5.762159 3.696665 3.966981 2.857597 4.180313 2.964263
## 190 181 283 278 71 93 3 41
## 3.675778 4.439442 4.222865 4.249329 3.544202 3.719884 4.748220 3.982190
## 188 52 53 361 299 331 184 107
## 4.495581 6.671763 4.234829 3.516927 3.079007 3.399851 4.562974 3.512128
## 335 220 24 83 232 46 343 294
## 3.765613 3.699876 4.581428 4.000644 4.518868 4.522011 3.593142 3.479277
## 104 73 133 255 329 108 319 44
## 4.386447 2.923266 2.778949 3.760036 3.145589 2.776549 4.242840 4.072836
## 82 213 124 314 168 145 264 284
## 4.570950 3.181686 2.886460 3.403062 3.150389 3.897965 6.087836 4.983285
## 226 101 165 109 157 58 297 277
## 4.910282 5.426849 4.344774 2.448473 3.526594 3.099793 2.877639 4.264572
## 187 317 48 365 163 246 183 57
## 4.124986 4.339941 3.207304 3.569889 3.980601 3.737560 4.182747 3.450345
## 261 201 162 80 286 112 269 96
## 5.875281 4.673662 4.035963 5.049833 4.355218 4.063981 4.408957 3.593919
## 310 217 99 214 66 142 59 207
## 3.684633 3.935718 5.706764 2.929012 4.060770 3.181618 3.628427 3.901210
## 27 28 300 166 241 206 8 258
## 3.159176 3.940483 3.785689 3.901987 3.314038 3.859503 2.790204 4.150673
## 197 61 122 301 42 315 328 298
## 3.936495 2.143615 3.285918 3.277131 3.763213 3.610007 3.727894 2.982751
## 152 56 67 78 97 268 318 340
## 4.209177 3.253844 4.891794 3.715862 5.458146 3.979824 4.101699 4.124175
## 205 355 35 150 194 173 20 103
## 3.334892 2.854352 3.419048 4.690493 3.903610 3.677400 3.100604 4.159460
## 69 211 239 292 313 229 273 116
## 2.924111 3.209772 4.462695 3.577933 4.285425 3.367777 4.510824 3.759934
## 307 86 282 130 68 31 60 235
## 3.155999 3.216126 4.638377 3.112670 3.589897 4.959221 2.822244 3.844261
## 19 359 271 208 143 215 230 338
## 3.423070 3.202505 5.478222 3.666179 3.821785 3.477688 3.528216 2.851919
## 216 177 154 132 2 312 289 293
## 3.796943 4.472328 4.290224 2.993938 4.758664 4.483549 4.481926 3.253878
## 7 254
## 3.342091 5.011371
# Overlay the linear model's predictions on the current plot as blue crosses,
# positioned at the observed Cloud9am value on the x-axis.
points(x = train$Cloud9am, y = predictedY, col = "blue", pch = 4)
#' Root mean squared error
#'
#' @param error Numeric vector of errors (e.g. model residuals).
#' @param na.rm Drop `NA` values before averaging? Defaults to `FALSE`, which
#'   keeps the original behaviour of returning `NA` when any error is missing.
#' @return A single number: the square root of the mean of the squared errors.
rmse <- function(error, na.rm = FALSE) {
  sqrt(mean(error^2, na.rm = na.rm))
}
# RMSE of the linear model on the training rows, taken from the residuals
# stored on the fitted object.
error <- model[["residuals"]]
predictionRMSE <- rmse(error)
print(predictionRMSE)
## [1] 2.853694
# Support vector regression (e1071) with the same response and predictors as
# the linear model.
# NOTE(review): the `train$` prefixes tie this fit to the `train` object, so
# predicting on another data frame will not use that frame's columns. Switching
# to bare column names looks safe for the fit, but predict.svm() appears to
# apply its NA handling to the whole `newdata` frame -- confirm the prediction
# length stays equal to nrow(train) before changing the formula.
library(e1071)
model2 <- svm(
  formula = train$Cloud9am ~ train$Evaporation + train$Pressure9am,
  data = train
)
predictedY2 <- predict(model2, newdata = train)
# Overlay the SVM predictions as red crosses.
points(x = train$Cloud9am, y = predictedY2, col = "red", pch = 4)

# RMSE of the SVM on the training rows; this overwrites the value stored for
# the linear model.
error2 <- model2[["residuals"]]
predictionRMSE <- rmse(error2)
print(predictionRMSE)
## [1] 2.860998
# Tune the SVM: grid search over epsilon (0 to 0.2 in steps of 0.01) and
# cost (2^2 up to 2^9).
tuneResult <- tune(
  svm,
  train$Cloud9am ~ train$Evaporation + train$Pressure9am,
  data = train,
  ranges = list(
    epsilon = seq(from = 0, to = 0.2, by = 0.01),
    cost = 2^(2:9)
  )
)
print(tuneResult)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## epsilon cost
## 0.2 128
##
## - best performance: 7.963795
# Performance map over the (epsilon, cost) grid.
plot(tuneResult)

# In the tuning plot, darker regions mark a lower cross-validated error (tune() reports mean squared error for regression), i.e. a better model.
# This means we can run another grid search over a narrower range around the best region.
# Second grid search: epsilon restricted to 0.18-0.20 (step 0.01), same cost
# values as before (2^2 up to 2^9).
tuneResult <- tune(
  svm,
  train$Cloud9am ~ train$Evaporation + train$Pressure9am,
  data = train,
  ranges = list(
    epsilon = seq(from = 0.18, to = 0.2, by = 0.01),
    cost = 2^(2:9)
  )
)
print(tuneResult)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## epsilon cost
## 0.2 128
##
## - best performance: 7.855223
# Performance map over the refined grid.
plot(tuneResult)

# tune() stores the best model found by the grid search; pull it out directly.
tunedModel <- tuneResult$best.model
tunedModelY <- predict(tunedModel, newdata = train)
# RMSE of the tuned SVM on the training rows, from the residuals stored on the
# best model. tune() randomly shuffles the data for cross-validation, so the
# value depends on the RNG state.
error <- tunedModel$residuals
tunedModelRMSE <- rmse(error)
print(tunedModelRMSE)
## [1] 2.762332