# SVM_of_Weather_dataset_from_rattle.R

library(rattle)

## Rattle: A free graphical interface for data mining with R.
## Version 4.1.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

# Work on a copy of the `weather` data frame that ships with rattle, then
# list its columns and their types.
data <- weather
str(object = data)

## 'data.frame':    366 obs. of  24 variables:
##  $ Date         : Date, format: "2007-11-01" "2007-11-02" ...
##  $ Location     : Factor w/ 49 levels "Adelaide","Albany",..: 10 10 10 10 10 10 10 10 10 10 ...
##  $ MinTemp      : num  8 14 13.7 13.3 7.6 6.2 6.1 8.3 8.8 8.4 ...
##  $ MaxTemp      : num  24.3 26.9 23.4 15.5 16.1 16.9 18.2 17 19.5 22.8 ...
##  $ Rainfall     : num  0 3.6 3.6 39.8 2.8 0 0.2 0 0 16.2 ...
##  $ Evaporation  : num  3.4 4.4 5.8 7.2 5.6 5.8 4.2 5.6 4 5.4 ...
##  $ Sunshine     : num  6.3 9.7 3.3 9.1 10.6 8.2 8.4 4.6 4.1 7.7 ...
##  $ WindGustDir  : Ord.factor w/ 16 levels "N"<"NNE"<"NE"<..: 15 4 15 15 8 7 7 5 9 5 ...
##  $ WindGustSpeed: num  30 39 85 54 50 44 43 41 48 31 ...
##  $ WindDir9am   : Ord.factor w/ 16 levels "N"<"NNE"<"NE"<..: 11 5 1 14 8 7 7 7 5 9 ...
##  $ WindDir3pm   : Ord.factor w/ 16 levels "N"<"NNE"<"NE"<..: 15 13 2 13 6 5 6 5 4 6 ...
##  $ WindSpeed9am : num  6 4 6 30 20 20 19 11 19 7 ...
##  $ WindSpeed3pm : num  20 17 6 24 28 24 26 24 17 6 ...
##  $ Humidity9am  : int  68 80 82 62 68 70 63 65 70 82 ...
##  $ Humidity3pm  : int  29 36 69 56 49 57 47 57 48 32 ...
##  $ Pressure9am  : num  1020 1012 1010 1006 1018 ...
##  $ Pressure3pm  : num  1015 1008 1007 1007 1018 ...
##  $ Cloud9am     : int  7 5 8 2 7 7 4 6 7 7 ...
##  $ Cloud3pm     : int  7 3 7 7 7 5 6 7 7 1 ...
##  $ Temp9am      : num  14.4 17.5 15.4 13.5 11.1 10.9 12.4 12.1 14.1 13.3 ...
##  $ Temp3pm      : num  23.6 25.7 20.2 14.1 15.4 14.8 17.3 15.5 18.9 21.7 ...
##  $ RainToday    : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 1 1 1 1 2 ...
##  $ RISK_MM      : num  3.6 3.6 39.8 2.8 0 0.2 0 0 16.2 0 ...
##  $ RainTomorrow : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 1 1 1 2 1 ...

## The training partition takes 75% of the rows (rounded down);
## the remaining rows form the test partition.
# sample(x, size, replace = FALSE, prob = NULL) is the general-purpose form
smp_size <- floor(nrow(data) * 0.75)

## Fix the seed so the partition is reproducible
set.seed(123)
train_ind <- sample.int(nrow(data), size = smp_size)

# Positive indices select the training rows; negating them selects the rest.
train <- data[train_ind, ]
test <- data[-train_ind, ]

# Visual check on the data. View() opens a GUI data viewer, so it is only
# called in an interactive session; batch runs skip it instead of failing.
if (interactive()) {
  View(train)
  View(test)
}

# No attach(train): every column in this script is referenced explicitly as
# train$<column>, and attaching would only put copies of the columns on the
# search path where they can mask other objects.

# Plot Cloud9am against each of the two candidate predictors
plot(train$Cloud9am, train$Evaporation, pch = 16,
     xlab = "Cloud9am", ylab = "Evaporation")

plot(train$Cloud9am, train$Pressure9am, pch = 16,
     xlab = "Cloud9am", ylab = "Pressure9am")

# Create a linear regression model of Cloud9am on Evaporation and Pressure9am.
# The formula names bare columns and the data frame is passed via `data =`.
# Writing train$Cloud9am ~ train$Evaporation + ... would bind the model to the
# global `train` vectors, so predict(model, newdata) would ignore `newdata`
# (for example the held-out `test` rows) and return training predictions.
model <- lm(Cloud9am ~ Evaporation + Pressure9am, data = train)
summary(model)

## 
## Call:
## lm(formula = Cloud9am ~ Evaporation + Pressure9am, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4394 -2.7544 -0.6714  2.8075  5.2829 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 127.70079   29.33022   4.354 1.90e-05 ***
## Evaporation  -0.25670    0.07246  -3.543 0.000466 ***
## Pressure9am  -0.12032    0.02863  -4.203 3.58e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.869 on 271 degrees of freedom
## Multiple R-squared:  0.0737, Adjusted R-squared:  0.06686 
## F-statistic: 10.78 on 2 and 271 DF,  p-value: 3.126e-05

model

## 
## Call:
## lm(formula = Cloud9am ~ Evaporation + Pressure9am, data = train)
## 
## Coefficients:
## (Intercept)  Evaporation  Pressure9am  
##    127.7008      -0.2567      -0.1203

# make a prediction for each training row
predictedY <- predict(model, train)

# Set up an empty observed-vs-predicted canvas for the overlays that follow.
# abline(model) is not meaningful here: the model has two predictors, so
# abline() would use only the intercept and the first slope (and warn).
# The dashed y = x line marks where a perfect prediction would fall.
cloud_range <- range(train$Cloud9am, predictedY, na.rm = TRUE)
plot(train$Cloud9am, predictedY, type = "n",
     xlim = cloud_range, ylim = cloud_range,
     xlab = "Observed Cloud9am", ylab = "Predicted Cloud9am")
abline(a = 0, b = 1, lty = 2)

predictedY

##      106      288      149      321      341       17      191      363 
## 4.555741 3.836217 4.362417 3.640459 5.004916 3.799275 4.023930 1.996020 
##      198      164      362      161      240      203       37      316 
## 4.480338 3.889144 3.492052 3.569111 3.162421 3.764835 4.557330 4.347985 
##       87       15      115      332      308      354      221      342 
## 3.495263 2.931344 4.487503 3.404651 2.618578 2.991538 2.739710 3.972557 
##      225      242      185      202       98       50      324      303 
## 4.220466 4.060838 3.850648 4.180347 6.408680 5.068321 4.482738 3.186485 
##      231      265        9      159      251       72      105       76 
## 3.910843 3.928485 3.212949 3.059743 4.421800 3.547379 5.355468 4.336730 
##       47      135      325      120      337       45       75      364 
## 4.543709 2.812646 4.130563 4.226008 3.915608 4.118531 2.717133 2.336162 
##       85      272      349      140      330       39      176       65 
## 3.174419 4.403346 3.375753 2.410754 2.838366 4.386447 4.015109 3.839394 
##       40      233      276      348      204       29      117       84 
## 3.302716 4.675285 5.142913 3.273886 3.054943 3.297139 4.072025 3.292340 
##      247      136      244      243      237      131      224      186 
## 3.248268 3.030034 5.000116 4.694515 3.627683 2.689925 4.573418 4.032752 
##      209        1      139      311      111      178      102       32 
## 3.979824 4.137018 3.159987 4.547731 3.824962 5.229537 4.254905 5.296930 
##       70      360      119      296       30      123      346      250 
## 3.459978 3.783255 3.741515 4.486760 4.643176 2.624155 2.649030 4.084902 
##      302       49      352      180       95      275       88       51 
## 2.909747 3.571477 2.840732 5.492620 3.881911 6.099057 4.109709 5.365912 
##      212       26      126      137      160       89      129      252 
## 2.872063 3.264254 2.771750 2.043371 2.908936 3.294706 3.176041 4.768330 
##      127      309      238      158      366       38      345       77 
## 2.964263 2.519922 3.702275 3.292374 4.684849 4.521233 2.785405 3.687742 
##       16      256      182       36      138      260      146      100 
## 3.070118 4.187580 4.108966 3.577899 2.454083 4.855766 5.434082 5.079508 
##      266       79      327       54       90      249      257       22 
## 2.938645 2.810212 4.178691 4.248484 2.616888 3.462445 4.603092 3.731071 
##       34      357      147      210      259      172      121      153 
## 5.526317 3.332458 5.762159 3.696665 3.966981 2.857597 4.180313 2.964263 
##      190      181      283      278       71       93        3       41 
## 3.675778 4.439442 4.222865 4.249329 3.544202 3.719884 4.748220 3.982190 
##      188       52       53      361      299      331      184      107 
## 4.495581 6.671763 4.234829 3.516927 3.079007 3.399851 4.562974 3.512128 
##      335      220       24       83      232       46      343      294 
## 3.765613 3.699876 4.581428 4.000644 4.518868 4.522011 3.593142 3.479277 
##      104       73      133      255      329      108      319       44 
## 4.386447 2.923266 2.778949 3.760036 3.145589 2.776549 4.242840 4.072836 
##       82      213      124      314      168      145      264      284 
## 4.570950 3.181686 2.886460 3.403062 3.150389 3.897965 6.087836 4.983285 
##      226      101      165      109      157       58      297      277 
## 4.910282 5.426849 4.344774 2.448473 3.526594 3.099793 2.877639 4.264572 
##      187      317       48      365      163      246      183       57 
## 4.124986 4.339941 3.207304 3.569889 3.980601 3.737560 4.182747 3.450345 
##      261      201      162       80      286      112      269       96 
## 5.875281 4.673662 4.035963 5.049833 4.355218 4.063981 4.408957 3.593919 
##      310      217       99      214       66      142       59      207 
## 3.684633 3.935718 5.706764 2.929012 4.060770 3.181618 3.628427 3.901210 
##       27       28      300      166      241      206        8      258 
## 3.159176 3.940483 3.785689 3.901987 3.314038 3.859503 2.790204 4.150673 
##      197       61      122      301       42      315      328      298 
## 3.936495 2.143615 3.285918 3.277131 3.763213 3.610007 3.727894 2.982751 
##      152       56       67       78       97      268      318      340 
## 4.209177 3.253844 4.891794 3.715862 5.458146 3.979824 4.101699 4.124175 
##      205      355       35      150      194      173       20      103 
## 3.334892 2.854352 3.419048 4.690493 3.903610 3.677400 3.100604 4.159460 
##       69      211      239      292      313      229      273      116 
## 2.924111 3.209772 4.462695 3.577933 4.285425 3.367777 4.510824 3.759934 
##      307       86      282      130       68       31       60      235 
## 3.155999 3.216126 4.638377 3.112670 3.589897 4.959221 2.822244 3.844261 
##       19      359      271      208      143      215      230      338 
## 3.423070 3.202505 5.478222 3.666179 3.821785 3.477688 3.528216 2.851919 
##      216      177      154      132        2      312      289      293 
## 3.796943 4.472328 4.290224 2.993938 4.758664 4.483549 4.481926 3.253878 
##        7      254 
## 3.342091 5.011371

# Overlay the linear-model predictions on the current plot as blue crosses
points(x = train$Cloud9am, y = predictedY, pch = 4, col = "blue")

# Root-mean-square error: square each residual, average, take the square root.
#
# error: numeric vector of residuals (observed minus predicted).
# Returns a single numeric value.
rmse <- function(error) {
  squared_error <- error^2
  sqrt(mean(squared_error))
}
# Training-set RMSE of the linear model, computed from its stored residuals
error <- model[["residuals"]]
predictionRMSE <- rmse(error)
predictionRMSE

## [1] 2.853694

# Using a Support Vector Machine with svm()'s default settings
library(e1071)
# Bare column names plus `data = train` keep the fit tied to whatever data
# frame is supplied. A train$Cloud9am ~ train$... formula would always read
# the global `train` vectors, so predict(model2, newdata) could not score
# new rows.
model2 <- svm(Cloud9am ~ Evaporation + Pressure9am, data = train)

predictedY2 <- predict(model2, train)

# Overlay the SVM predictions on the current plot as red crosses
points(train$Cloud9am, predictedY2, col = "red", pch = 4)

# Training-set RMSE of the SVM (this overwrites the linear model's value)
error2 <- model2$residuals
predictionRMSE <- rmse(error2)
predictionRMSE

## [1] 2.860998

# Tuning the result: grid search over epsilon and cost with tune().
# The formula uses bare column names so that each cross-validation fold is
# fitted and scored on its own subset of `data`. With train$<column> terms the
# held-out rows passed to predict() are ignored and every fold is scored on
# the full `train` vectors, which leaks training rows into the CV error.
# NOTE: the recorded output below was produced before this fix; re-run to
# refresh it.
tuneResult <- tune(
  svm, Cloud9am ~ Evaporation + Pressure9am,
  data = train,
  ranges = list(epsilon = seq(0, 0.2, 0.01), cost = 2^(2:9))
)

print(tuneResult)

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  epsilon cost
##      0.2  128
## 
## - best performance: 7.963795

plot(tuneResult)

# The darker a region is, the better the model: darker means a lower
# cross-validation error. The value tune() reports here is the mean squared
# error, not the RMSE (compare "best performance" with the squared RMSEs).
# This means we can try another grid and search in a narrower epsilon range.

# Bare column names (not train$<column>) so each fold is scored on its own
# held-out rows rather than on the full `train` vectors.
# NOTE: the recorded output below was produced before this fix; re-run to
# refresh it.
tuneResult <- tune(
  svm, Cloud9am ~ Evaporation + Pressure9am,
  data = train,
  ranges = list(epsilon = seq(0.18, 0.2, 0.01), cost = 2^(2:9))
)

print(tuneResult)

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  epsilon cost
##      0.2  128
## 
## - best performance: 7.855223

plot(tuneResult)

# tune() hands back the winning model directly in its result
tunedModel <- tuneResult[["best.model"]]
tunedModelY <- predict(tunedModel, train)
# Training-set RMSE of the tuned model, from its stored residuals
error <- tunedModel[["residuals"]]
tunedModelRMSE <- rmse(error)
tunedModelRMSE # the tune method randomly shuffles the data

## [1] 2.762332

# SVM_of_Weather_dataset_from_rattle.R

# sanpande

# Wed Nov 09 18:10:43 2016