suppressMessages(library("corrplot"))
suppressMessages(library("ggplot2"))
suppressMessages(library("e1071"))
suppressMessages(library("printr"))
suppressMessages(library('caret'))
suppressMessages(library('randomForest'))
suppressMessages(library('ROCR'))
suppressMessages(library('reshape2'))
#Preprocessing
## First step was to import the data into R and run descriptive statistics to identify any discrepancies.
# Header names were inserted into the dataset and saved in a CSV file.
# Load the phishing dataset from the CSV file; its first row holds the
# column names (header = TRUE).
df <- read.csv("phishing.csv", sep = ",", header = TRUE)
# Preview the first rows to check that the import looks right.
head(df)
| -1 |
1 |
1 |
1 |
-1 |
-1 |
-1 |
-1 |
-1 |
1 |
1 |
-1 |
1 |
-1 |
1 |
-1 |
-1 |
-1 |
0 |
1 |
1 |
1 |
1 |
-1 |
-1 |
-1 |
-1 |
1 |
1 |
-1 |
-1 |
| 1 |
1 |
1 |
1 |
1 |
-1 |
0 |
1 |
-1 |
1 |
1 |
-1 |
1 |
0 |
-1 |
-1 |
1 |
1 |
0 |
1 |
1 |
1 |
1 |
-1 |
-1 |
0 |
-1 |
1 |
1 |
1 |
-1 |
| 1 |
0 |
1 |
1 |
1 |
-1 |
-1 |
-1 |
-1 |
1 |
1 |
-1 |
1 |
0 |
-1 |
-1 |
-1 |
-1 |
0 |
1 |
1 |
1 |
1 |
1 |
-1 |
1 |
-1 |
1 |
0 |
-1 |
-1 |
| 1 |
0 |
1 |
1 |
1 |
-1 |
-1 |
-1 |
1 |
1 |
1 |
-1 |
-1 |
0 |
0 |
-1 |
1 |
1 |
0 |
1 |
1 |
1 |
1 |
-1 |
-1 |
1 |
-1 |
1 |
-1 |
1 |
-1 |
| 1 |
0 |
-1 |
1 |
1 |
-1 |
1 |
1 |
-1 |
1 |
1 |
1 |
1 |
0 |
0 |
-1 |
1 |
1 |
0 |
-1 |
1 |
-1 |
1 |
-1 |
-1 |
0 |
-1 |
1 |
1 |
1 |
1 |
| -1 |
0 |
-1 |
1 |
-1 |
-1 |
1 |
1 |
-1 |
1 |
1 |
-1 |
1 |
0 |
0 |
-1 |
-1 |
-1 |
0 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
-1 |
1 |
-1 |
-1 |
1 |
#Descriptive statistics
# Show the structure of df: number of observations and variables, plus the
# type and first few values of every column.
str(df)
## 'data.frame': 11055 obs. of 31 variables:
## $ IP_Address : int -1 1 1 1 1 -1 1 1 1 1 ...
## $ URL_Length : int 1 1 0 0 0 0 0 0 0 1 ...
## $ Shortining_Service : int 1 1 1 1 -1 -1 -1 1 -1 -1 ...
## $ having_At_Symbol : int 1 1 1 1 1 1 1 1 1 1 ...
## $ double_slash_redirecting : int -1 1 1 1 1 -1 1 1 1 1 ...
## $ Prefix_Suffix : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ having_Sub_Domain : int -1 0 -1 -1 1 1 -1 -1 1 -1 ...
## $ SSLfinal_State : int -1 1 -1 -1 1 1 -1 -1 1 1 ...
## $ Domain_registeration_length: int -1 -1 -1 1 -1 -1 1 1 -1 -1 ...
## $ Favicon : int 1 1 1 1 1 1 1 1 1 1 ...
## $ port : int 1 1 1 1 1 1 1 1 1 1 ...
## $ HTTPS_token : int -1 -1 -1 -1 1 -1 1 -1 -1 1 ...
## $ Request_URL : int 1 1 1 -1 1 1 -1 -1 1 1 ...
## $ URL_of_Anchor : int -1 0 0 0 0 0 -1 0 0 0 ...
## $ Links_in_tags : int 1 -1 -1 0 0 0 0 -1 1 1 ...
## $ SFH : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ Submitting_to_email : int -1 1 -1 1 1 -1 -1 1 1 1 ...
## $ Abnormal_URL : int -1 1 -1 1 1 -1 -1 1 1 1 ...
## $ Redirect : int 0 0 0 0 0 0 0 0 0 0 ...
## $ on_mouseover : int 1 1 1 1 -1 1 1 1 1 1 ...
## $ RightClick : int 1 1 1 1 1 1 1 1 1 1 ...
## $ popUpWidnow : int 1 1 1 1 -1 1 1 1 1 1 ...
## $ Iframe : int 1 1 1 1 1 1 1 1 1 1 ...
## $ age_of_domain : int -1 -1 1 -1 -1 1 1 -1 1 1 ...
## $ DNSRecord : int -1 -1 -1 -1 -1 1 -1 -1 -1 -1 ...
## $ web_traffic : int -1 0 1 1 0 1 -1 0 1 0 ...
## $ Page_Rank : int -1 -1 -1 -1 -1 -1 -1 -1 1 -1 ...
## $ Google_Index : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Links_pointing_to_page : int 1 1 0 -1 1 -1 0 0 0 0 ...
## $ Statistical_report : int -1 1 -1 1 1 -1 -1 1 1 1 ...
## $ Result : int -1 -1 -1 -1 1 1 -1 -1 1 -1 ...
# Per-column summary statistics (min, quartiles, median, mean, max).
summary(df)
|
Min. :-1.0000 |
Min. :-1.0000 |
Min. :-1.0000 |
Min. :-1.0000 |
Min. :-1.0000 |
Min. :-1.000 |
Min. :-1.00000 |
Min. :-1.0000 |
Min. :-1.0000 |
Min. :-1.0000 |
Min. :-1.0000 |
Min. :-1.0000 |
Min. :-1.0000 |
Min. :-1.00000 |
Min. :-1.0000 |
Min. :-1.0000 |
Min. :-1.0000 |
Min. :-1.0000 |
Min. :0.0000 |
Min. :-1.0000 |
Min. :-1.0000 |
Min. :-1.0000 |
Min. :-1.0000 |
Min. :-1.00000 |
Min. :-1.0000 |
Min. :-1.0000 |
Min. :-1.0000 |
Min. :-1.0000 |
Min. :-1.000 |
Min. :-1.0000 |
Min. :-1.0000 |
|
1st Qu.:-1.0000 |
1st Qu.:-1.0000 |
1st Qu.: 1.0000 |
1st Qu.: 1.0000 |
1st Qu.: 1.0000 |
1st Qu.:-1.000 |
1st Qu.:-1.00000 |
1st Qu.:-1.0000 |
1st Qu.:-1.0000 |
1st Qu.: 1.0000 |
1st Qu.: 1.0000 |
1st Qu.: 1.0000 |
1st Qu.:-1.0000 |
1st Qu.:-1.00000 |
1st Qu.:-1.0000 |
1st Qu.:-1.0000 |
1st Qu.: 1.0000 |
1st Qu.: 1.0000 |
1st Qu.:0.0000 |
1st Qu.: 1.0000 |
1st Qu.: 1.0000 |
1st Qu.: 1.0000 |
1st Qu.: 1.0000 |
1st Qu.:-1.00000 |
1st Qu.:-1.0000 |
1st Qu.: 0.0000 |
1st Qu.:-1.0000 |
1st Qu.: 1.0000 |
1st Qu.: 0.000 |
1st Qu.: 1.0000 |
1st Qu.:-1.0000 |
|
Median : 1.0000 |
Median :-1.0000 |
Median : 1.0000 |
Median : 1.0000 |
Median : 1.0000 |
Median :-1.000 |
Median : 0.00000 |
Median : 1.0000 |
Median :-1.0000 |
Median : 1.0000 |
Median : 1.0000 |
Median : 1.0000 |
Median : 1.0000 |
Median : 0.00000 |
Median : 0.0000 |
Median :-1.0000 |
Median : 1.0000 |
Median : 1.0000 |
Median :0.0000 |
Median : 1.0000 |
Median : 1.0000 |
Median : 1.0000 |
Median : 1.0000 |
Median : 1.00000 |
Median : 1.0000 |
Median : 1.0000 |
Median :-1.0000 |
Median : 1.0000 |
Median : 0.000 |
Median : 1.0000 |
Median : 1.0000 |
|
Mean : 0.3138 |
Mean :-0.6332 |
Mean : 0.7388 |
Mean : 0.7006 |
Mean : 0.7415 |
Mean :-0.735 |
Mean : 0.06395 |
Mean : 0.2509 |
Mean :-0.3368 |
Mean : 0.6286 |
Mean : 0.7283 |
Mean : 0.6751 |
Mean : 0.1868 |
Mean :-0.07653 |
Mean :-0.1181 |
Mean :-0.5957 |
Mean : 0.6356 |
Mean : 0.7053 |
Mean :0.1157 |
Mean : 0.7621 |
Mean : 0.9139 |
Mean : 0.6134 |
Mean : 0.8169 |
Mean : 0.06124 |
Mean : 0.3771 |
Mean : 0.2873 |
Mean :-0.4837 |
Mean : 0.7216 |
Mean : 0.344 |
Mean : 0.7196 |
Mean : 0.1139 |
|
3rd Qu.: 1.0000 |
3rd Qu.:-1.0000 |
3rd Qu.: 1.0000 |
3rd Qu.: 1.0000 |
3rd Qu.: 1.0000 |
3rd Qu.:-1.000 |
3rd Qu.: 1.00000 |
3rd Qu.: 1.0000 |
3rd Qu.: 1.0000 |
3rd Qu.: 1.0000 |
3rd Qu.: 1.0000 |
3rd Qu.: 1.0000 |
3rd Qu.: 1.0000 |
3rd Qu.: 0.00000 |
3rd Qu.: 0.0000 |
3rd Qu.:-1.0000 |
3rd Qu.: 1.0000 |
3rd Qu.: 1.0000 |
3rd Qu.:0.0000 |
3rd Qu.: 1.0000 |
3rd Qu.: 1.0000 |
3rd Qu.: 1.0000 |
3rd Qu.: 1.0000 |
3rd Qu.: 1.00000 |
3rd Qu.: 1.0000 |
3rd Qu.: 1.0000 |
3rd Qu.: 1.0000 |
3rd Qu.: 1.0000 |
3rd Qu.: 1.000 |
3rd Qu.: 1.0000 |
3rd Qu.: 1.0000 |
|
Max. : 1.0000 |
Max. : 1.0000 |
Max. : 1.0000 |
Max. : 1.0000 |
Max. : 1.0000 |
Max. : 1.000 |
Max. : 1.00000 |
Max. : 1.0000 |
Max. : 1.0000 |
Max. : 1.0000 |
Max. : 1.0000 |
Max. : 1.0000 |
Max. : 1.0000 |
Max. : 1.00000 |
Max. : 1.0000 |
Max. : 1.0000 |
Max. : 1.0000 |
Max. : 1.0000 |
Max. :1.0000 |
Max. : 1.0000 |
Max. : 1.0000 |
Max. : 1.0000 |
Max. : 1.0000 |
Max. : 1.00000 |
Max. : 1.0000 |
Max. : 1.0000 |
Max. : 1.0000 |
Max. : 1.0000 |
Max. : 1.000 |
Max. : 1.0000 |
Max. : 1.0000 |
# There are 11055 observations of 31 variables: 30 attributes plus the last column (Result), which is the target (response) variable.
# The commands above show whether the data is laid out as explained in the link. If it is not, more processing has to be done: e.g. if the column names do not match the data fields, the file needs to be reorganized and relabeled using R or Excel and saved as CSV again.
# Build one frequency table per column (all 30 attributes plus Result),
# stored as a named list in `a`.
a <- lapply(
  df[c(
    "IP_Address", "URL_Length", "Shortining_Service",
    "having_At_Symbol", "double_slash_redirecting",
    "Prefix_Suffix", "having_Sub_Domain", "SSLfinal_State",
    "Domain_registeration_length", "Favicon", "port",
    "HTTPS_token", "Request_URL", "URL_of_Anchor",
    "Links_in_tags", "SFH", "Submitting_to_email",
    "Abnormal_URL", "Redirect", "on_mouseover",
    "RightClick", "popUpWidnow", "Iframe", "age_of_domain",
    "DNSRecord", "web_traffic", "Page_Rank", "Google_Index",
    "Links_pointing_to_page", "Statistical_report", "Result"
  )],
  table
)
# Visualization
# Check the score distribution of the first 30 columns, ten columns per
# box plot so the axis labels stay legible.
df1 <- df[, 1:10]
boxplot(df1)

df2 <- df[, 11:20]
boxplot(df2)

df3 <- df[, 21:30]
boxplot(df3)

# Correlation plot
# Pairwise correlation matrix of all columns, displayed rounded to two
# decimal places (cmatrix itself keeps full precision).
cmatrix <- cor(df)
round(cmatrix, digits = 2)
| IP_Address |
1.00 |
-0.05 |
0.40 |
0.16 |
0.40 |
-0.01 |
-0.08 |
0.07 |
-0.02 |
0.09 |
0.06 |
0.36 |
0.03 |
0.10 |
0.01 |
-0.01 |
0.08 |
0.34 |
-0.32 |
0.08 |
0.04 |
0.10 |
0.05 |
-0.01 |
-0.05 |
0.00 |
-0.09 |
0.03 |
-0.34 |
-0.02 |
0.09 |
| URL_Length |
-0.05 |
1.00 |
-0.10 |
-0.08 |
-0.08 |
0.06 |
0.00 |
0.05 |
-0.22 |
-0.04 |
0.00 |
-0.09 |
0.25 |
-0.02 |
0.05 |
0.41 |
-0.01 |
-0.11 |
0.05 |
-0.05 |
-0.01 |
-0.05 |
-0.01 |
0.18 |
-0.04 |
0.01 |
0.18 |
0.00 |
-0.02 |
-0.07 |
0.06 |
| Shortining_Service |
0.40 |
-0.10 |
1.00 |
0.10 |
0.84 |
-0.08 |
-0.04 |
-0.06 |
0.06 |
0.01 |
0.00 |
0.76 |
-0.04 |
0.00 |
-0.13 |
-0.02 |
0.05 |
0.74 |
-0.53 |
0.06 |
0.04 |
0.04 |
0.02 |
-0.05 |
0.44 |
-0.05 |
0.01 |
0.16 |
-0.20 |
0.09 |
-0.07 |
| having_At_Symbol |
0.16 |
-0.08 |
0.10 |
1.00 |
0.09 |
-0.01 |
-0.06 |
0.03 |
0.02 |
0.30 |
0.36 |
0.10 |
0.03 |
0.06 |
-0.07 |
-0.01 |
0.37 |
0.20 |
-0.03 |
0.28 |
0.22 |
0.29 |
0.28 |
-0.01 |
-0.05 |
0.03 |
-0.06 |
0.04 |
-0.01 |
-0.08 |
0.05 |
| double_slash_redirecting |
0.40 |
-0.08 |
0.84 |
0.09 |
1.00 |
-0.09 |
-0.04 |
-0.04 |
0.05 |
0.04 |
0.03 |
0.76 |
-0.03 |
-0.01 |
-0.13 |
-0.04 |
0.03 |
0.72 |
-0.59 |
0.09 |
0.03 |
0.05 |
0.01 |
-0.05 |
0.43 |
-0.06 |
0.00 |
0.18 |
-0.19 |
0.07 |
-0.04 |
| Prefix_Suffix |
-0.01 |
0.06 |
-0.08 |
-0.01 |
-0.09 |
1.00 |
0.09 |
0.26 |
-0.10 |
-0.01 |
-0.02 |
-0.07 |
0.10 |
0.35 |
0.10 |
0.00 |
-0.05 |
-0.08 |
0.02 |
0.01 |
-0.02 |
-0.01 |
-0.04 |
0.07 |
-0.02 |
0.11 |
-0.01 |
0.07 |
0.07 |
0.00 |
0.35 |
| having_Sub_Domain |
-0.08 |
0.00 |
-0.04 |
-0.06 |
-0.04 |
0.09 |
1.00 |
0.27 |
-0.08 |
-0.02 |
0.00 |
-0.04 |
0.10 |
0.23 |
0.09 |
0.10 |
0.01 |
-0.03 |
0.03 |
-0.02 |
0.02 |
-0.03 |
0.01 |
0.12 |
0.13 |
-0.01 |
0.12 |
0.06 |
-0.01 |
0.08 |
0.30 |
| SSLfinal_State |
0.07 |
0.05 |
-0.06 |
0.03 |
-0.04 |
0.26 |
0.27 |
1.00 |
-0.19 |
-0.01 |
0.03 |
-0.03 |
0.19 |
0.54 |
0.18 |
0.17 |
0.01 |
-0.05 |
-0.02 |
0.02 |
0.02 |
-0.01 |
0.00 |
0.16 |
0.05 |
0.26 |
0.07 |
0.10 |
-0.01 |
0.06 |
0.71 |
| Domain_registeration_length |
-0.02 |
-0.22 |
0.06 |
0.02 |
0.05 |
-0.10 |
-0.08 |
-0.19 |
1.00 |
0.05 |
0.02 |
0.06 |
-0.61 |
-0.16 |
-0.10 |
-0.14 |
0.04 |
0.06 |
-0.02 |
0.02 |
0.02 |
0.05 |
0.00 |
-0.06 |
-0.01 |
-0.13 |
-0.06 |
-0.04 |
0.12 |
0.00 |
-0.23 |
| Favicon |
0.09 |
-0.04 |
0.01 |
0.30 |
0.04 |
-0.01 |
-0.02 |
-0.01 |
0.05 |
1.00 |
0.80 |
0.05 |
0.00 |
0.04 |
-0.10 |
-0.01 |
0.67 |
0.07 |
-0.02 |
0.71 |
0.41 |
0.94 |
0.63 |
0.00 |
0.09 |
-0.05 |
0.01 |
-0.02 |
-0.13 |
0.30 |
0.00 |
| port |
0.06 |
0.00 |
0.00 |
0.36 |
0.03 |
-0.02 |
0.00 |
0.03 |
0.02 |
0.80 |
1.00 |
0.00 |
0.03 |
0.04 |
-0.07 |
0.01 |
0.80 |
0.05 |
-0.02 |
0.62 |
0.48 |
0.75 |
0.69 |
0.01 |
0.05 |
-0.03 |
0.02 |
-0.01 |
-0.14 |
0.34 |
0.04 |
| HTTPS_token |
0.36 |
-0.09 |
0.76 |
0.10 |
0.76 |
-0.07 |
-0.04 |
-0.03 |
0.06 |
0.05 |
0.00 |
1.00 |
-0.01 |
0.01 |
-0.10 |
-0.01 |
0.08 |
0.72 |
-0.46 |
0.11 |
0.01 |
0.07 |
0.02 |
-0.05 |
0.40 |
-0.04 |
0.02 |
0.12 |
-0.13 |
0.10 |
-0.04 |
| Request_URL |
0.03 |
0.25 |
-0.04 |
0.03 |
-0.03 |
0.10 |
0.10 |
0.19 |
-0.61 |
0.00 |
0.03 |
-0.01 |
1.00 |
0.18 |
0.07 |
0.13 |
0.02 |
-0.04 |
0.00 |
0.01 |
-0.02 |
0.00 |
0.02 |
0.09 |
0.02 |
0.16 |
0.06 |
0.05 |
-0.07 |
0.04 |
0.25 |
| URL_of_Anchor |
0.10 |
-0.02 |
0.00 |
0.06 |
-0.01 |
0.35 |
0.23 |
0.54 |
-0.16 |
0.04 |
0.04 |
0.01 |
0.18 |
1.00 |
0.14 |
0.11 |
0.03 |
-0.01 |
0.00 |
0.07 |
0.02 |
0.04 |
0.01 |
0.08 |
0.09 |
0.33 |
0.10 |
0.04 |
0.02 |
0.08 |
0.69 |
| Links_in_tags |
0.01 |
0.05 |
-0.13 |
-0.07 |
-0.13 |
0.10 |
0.09 |
0.18 |
-0.10 |
-0.10 |
-0.07 |
-0.10 |
0.07 |
0.14 |
1.00 |
0.07 |
-0.04 |
-0.12 |
0.04 |
-0.08 |
-0.04 |
-0.11 |
-0.07 |
0.08 |
-0.04 |
0.06 |
-0.01 |
0.05 |
0.01 |
-0.09 |
0.25 |
| SFH |
-0.01 |
0.41 |
-0.02 |
-0.01 |
-0.04 |
0.00 |
0.10 |
0.17 |
-0.14 |
-0.01 |
0.01 |
-0.01 |
0.13 |
0.11 |
0.07 |
1.00 |
0.01 |
-0.03 |
0.05 |
0.01 |
0.01 |
0.00 |
0.01 |
-0.02 |
0.03 |
0.05 |
0.00 |
0.03 |
-0.01 |
-0.01 |
0.22 |
| Submitting_to_email |
0.08 |
-0.01 |
0.05 |
0.37 |
0.03 |
-0.05 |
0.01 |
0.01 |
0.04 |
0.67 |
0.80 |
0.08 |
0.02 |
0.03 |
-0.04 |
0.01 |
1.00 |
0.20 |
-0.01 |
0.53 |
0.40 |
0.63 |
0.58 |
0.01 |
0.06 |
-0.02 |
0.03 |
-0.01 |
-0.04 |
0.35 |
0.02 |
| Abnormal_URL |
0.34 |
-0.11 |
0.74 |
0.20 |
0.72 |
-0.08 |
-0.03 |
-0.05 |
0.06 |
0.07 |
0.05 |
0.72 |
-0.04 |
-0.01 |
-0.12 |
-0.03 |
0.20 |
1.00 |
-0.46 |
0.12 |
0.02 |
0.09 |
0.02 |
-0.03 |
0.37 |
-0.05 |
0.01 |
0.12 |
-0.16 |
0.19 |
-0.06 |
| Redirect |
-0.32 |
0.05 |
-0.53 |
-0.03 |
-0.59 |
0.02 |
0.03 |
-0.02 |
-0.02 |
-0.02 |
-0.02 |
-0.46 |
0.00 |
0.00 |
0.04 |
0.05 |
-0.01 |
-0.46 |
1.00 |
-0.03 |
-0.02 |
-0.03 |
-0.01 |
-0.02 |
-0.21 |
0.00 |
0.05 |
0.06 |
0.16 |
-0.06 |
-0.02 |
| on_mouseover |
0.08 |
-0.05 |
0.06 |
0.28 |
0.09 |
0.01 |
-0.02 |
0.02 |
0.02 |
0.71 |
0.62 |
0.11 |
0.01 |
0.07 |
-0.08 |
0.01 |
0.53 |
0.12 |
-0.03 |
1.00 |
0.47 |
0.73 |
0.66 |
0.01 |
0.09 |
-0.04 |
0.02 |
-0.01 |
-0.04 |
0.28 |
0.04 |
| RightClick |
0.04 |
-0.01 |
0.04 |
0.22 |
0.03 |
-0.02 |
0.02 |
0.02 |
0.02 |
0.41 |
0.48 |
0.01 |
-0.02 |
0.02 |
-0.04 |
0.01 |
0.40 |
0.02 |
-0.02 |
0.47 |
1.00 |
0.42 |
0.66 |
0.01 |
0.04 |
-0.01 |
0.03 |
-0.01 |
-0.12 |
0.20 |
0.01 |
| popUpWidnow |
0.10 |
-0.05 |
0.04 |
0.29 |
0.05 |
-0.01 |
-0.03 |
-0.01 |
0.05 |
0.94 |
0.75 |
0.07 |
0.00 |
0.04 |
-0.11 |
0.00 |
0.63 |
0.09 |
-0.03 |
0.73 |
0.42 |
1.00 |
0.63 |
0.00 |
0.10 |
-0.04 |
0.02 |
-0.01 |
-0.12 |
0.29 |
0.00 |
| Iframe |
0.05 |
-0.01 |
0.02 |
0.28 |
0.01 |
-0.04 |
0.01 |
0.00 |
0.00 |
0.63 |
0.69 |
0.02 |
0.02 |
0.01 |
-0.07 |
0.01 |
0.58 |
0.02 |
-0.01 |
0.66 |
0.66 |
0.63 |
1.00 |
0.02 |
0.05 |
-0.02 |
0.02 |
0.00 |
-0.14 |
0.27 |
0.00 |
| age_of_domain |
-0.01 |
0.18 |
-0.05 |
-0.01 |
-0.05 |
0.07 |
0.12 |
0.16 |
-0.06 |
0.00 |
0.01 |
-0.05 |
0.09 |
0.08 |
0.08 |
-0.02 |
0.01 |
-0.03 |
-0.02 |
0.01 |
0.01 |
0.00 |
0.02 |
1.00 |
-0.03 |
0.09 |
-0.15 |
-0.03 |
0.04 |
0.01 |
0.12 |
| DNSRecord |
-0.05 |
-0.04 |
0.44 |
-0.05 |
0.43 |
-0.02 |
0.13 |
0.05 |
-0.01 |
0.09 |
0.05 |
0.40 |
0.02 |
0.09 |
-0.04 |
0.03 |
0.06 |
0.37 |
-0.21 |
0.09 |
0.04 |
0.10 |
0.05 |
-0.03 |
1.00 |
0.05 |
0.14 |
0.14 |
-0.32 |
0.14 |
0.08 |
| web_traffic |
0.00 |
0.01 |
-0.05 |
0.03 |
-0.06 |
0.11 |
-0.01 |
0.26 |
-0.13 |
-0.05 |
-0.03 |
-0.04 |
0.16 |
0.33 |
0.06 |
0.05 |
-0.02 |
-0.05 |
0.00 |
-0.04 |
-0.01 |
-0.04 |
-0.02 |
0.09 |
0.05 |
1.00 |
0.03 |
-0.01 |
-0.02 |
0.01 |
0.35 |
| Page_Rank |
-0.09 |
0.18 |
0.01 |
-0.06 |
0.00 |
-0.01 |
0.12 |
0.07 |
-0.06 |
0.01 |
0.02 |
0.02 |
0.06 |
0.10 |
-0.01 |
0.00 |
0.03 |
0.01 |
0.05 |
0.02 |
0.03 |
0.02 |
0.02 |
-0.15 |
0.14 |
0.03 |
1.00 |
0.03 |
-0.03 |
0.03 |
0.10 |
| Google_Index |
0.03 |
0.00 |
0.16 |
0.04 |
0.18 |
0.07 |
0.06 |
0.10 |
-0.04 |
-0.02 |
-0.01 |
0.12 |
0.05 |
0.04 |
0.05 |
0.03 |
-0.01 |
0.12 |
0.06 |
-0.01 |
-0.01 |
-0.01 |
0.00 |
-0.03 |
0.14 |
-0.01 |
0.03 |
1.00 |
-0.04 |
-0.01 |
0.13 |
| Links_pointing_to_page |
-0.34 |
-0.02 |
-0.20 |
-0.01 |
-0.19 |
0.07 |
-0.01 |
-0.01 |
0.12 |
-0.13 |
-0.14 |
-0.13 |
-0.07 |
0.02 |
0.01 |
-0.01 |
-0.04 |
-0.16 |
0.16 |
-0.04 |
-0.12 |
-0.12 |
-0.14 |
0.04 |
-0.32 |
-0.02 |
-0.03 |
-0.04 |
1.00 |
-0.02 |
0.03 |
| Statistical_report |
-0.02 |
-0.07 |
0.09 |
-0.08 |
0.07 |
0.00 |
0.08 |
0.06 |
0.00 |
0.30 |
0.34 |
0.10 |
0.04 |
0.08 |
-0.09 |
-0.01 |
0.35 |
0.19 |
-0.06 |
0.28 |
0.20 |
0.29 |
0.27 |
0.01 |
0.14 |
0.01 |
0.03 |
-0.01 |
-0.02 |
1.00 |
0.08 |
| Result |
0.09 |
0.06 |
-0.07 |
0.05 |
-0.04 |
0.35 |
0.30 |
0.71 |
-0.23 |
0.00 |
0.04 |
-0.04 |
0.25 |
0.69 |
0.25 |
0.22 |
0.02 |
-0.06 |
-0.02 |
0.04 |
0.01 |
0.00 |
0.00 |
0.12 |
0.08 |
0.35 |
0.10 |
0.13 |
0.03 |
0.08 |
1.00 |
# Replace the long column names with their positions so the plot labels stay
# readable. This overwrites the names on df itself; df is re-read from the
# CSV before modelling. The column count is taken from df instead of being
# hard-coded.
colnames(df) <- seq_len(ncol(df))
corrplot(cor(df), method = "shade", shade.col = NA, tl.col = "black", tl.srt = 45)

# Reshape to long format (one row per column/value pair) and draw the count
# of each value per column, one facet row per distinct value.
# The constant fill set in geom_bar() takes precedence over the fill mapping
# given in aes().
a <- melt(df)
## No id variables; using all as measure variables
ggplot(a, aes(x = variable, fill = value)) +
  geom_bar(position = "identity", fill = "#FF9999", colour = "black") +
  facet_grid(value ~ .) +
  ggtitle("Phishing Database Data Distribution")

# Re-read the data with every column as a factor (this also restores the
# original column names that were replaced for plotting).
df <- read.csv("phishing.csv", sep = ",", header = TRUE, colClasses = "factor")
# Data Mining algorithm-1 - Naive Bayes
# Use 80% of the rows for training; the remaining rows form the test set.
smp_size <- floor(0.80 * nrow(df))
## set the seed to make your partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(df)), size = smp_size)
train <- df[train_ind, ]
test <- df[-train_ind, ]
# Fit a Naive Bayes classifier predicting Result from all other columns.
nb_model <- naiveBayes(Result ~ ., data = train)
nb_model
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## -1 1
## 0.4448213 0.5551787
##
## Conditional probabilities:
## IP_Address
## Y -1 1
## -1 0.3957804 0.6042196
## 1 0.3034623 0.6965377
##
## URL_Length
## Y -1 0 1
## -1 0.83706151 0.01626843 0.14667006
## 1 0.79042770 0.00814664 0.20142566
##
## Shortining_Service
## Y -1 1
## -1 0.1026945 0.8973055
## 1 0.1537678 0.8462322
##
## having_At_Symbol
## Y -1 1
## -1 0.1725979 0.8274021
## 1 0.1309572 0.8690428
##
## double_slash_redirecting
## Y -1 1
## -1 0.1123538 0.8876462
## 1 0.1425662 0.8574338
##
## Prefix_Suffix
## Y -1 1
## -1 1.0000000 0.0000000
## 1 0.7647658 0.2352342
##
## having_Sub_Domain
## Y -1 0 1
## -1 0.3797661 0.4600915 0.1601423
## 1 0.2415479 0.2224033 0.5360489
##
## SSLfinal_State
## Y -1 0 1
## -1 0.62048805 0.23690900 0.14260295
## 1 0.08167006 0.00305499 0.91527495
##
## Domain_registeration_length
## Y -1 1
## -1 0.5495679 0.4504321
## 1 0.7663951 0.2336049
##
## Favicon
## Y -1 1
## -1 0.1911540 0.8088460
## 1 0.1855397 0.8144603
##
## port
## Y -1 1
## -1 0.1548043 0.8451957
## 1 0.1242363 0.8757637
##
## HTTPS_token
## Y -1 1
## -1 0.1464159 0.8535841
## 1 0.1749491 0.8250509
##
## Request_URL
## Y -1 1
## -1 0.5485511 0.4514489
## 1 0.2930754 0.7069246
##
## URL_of_Anchor
## Y -1 0 1
## -1 0.663446873 0.305287239 0.031265887
## 1 0.006720978 0.622810591 0.370468432
##
## Links_in_tags
## Y -1 0 1
## -1 0.4850025 0.3581596 0.1568378
## 1 0.2537678 0.4429735 0.3032587
##
## SFH
## Y -1 0 1
## -1 0.86731063 0.05541434 0.07727504
## 1 0.68268839 0.08228106 0.23503055
##
## Submitting_to_email
## Y -1 1
## -1 0.1970005 0.8029995
## 1 0.1741344 0.8258656
##
## Abnormal_URL
## Y -1 1
## -1 0.1245552 0.8754448
## 1 0.1676171 0.8323829
##
## Redirect
## Y 0 1
## -1 0.8746823 0.1253177
## 1 0.8890020 0.1109980
##
## on_mouseover
## Y -1 1
## -1 0.1365023 0.8634977
## 1 0.1067210 0.8932790
##
## RightClick
## Y -1 1
## -1 0.04778851 0.95221149
## 1 0.04114053 0.95885947
##
## popUpWidnow
## Y -1 1
## -1 0.1967463 0.8032537
## 1 0.1928717 0.8071283
##
## Iframe
## Y -1 1
## -1 0.09125572 0.90874428
## 1 0.09246436 0.90753564
##
## age_of_domain
## Y -1 1
## -1 0.5391459 0.4608541
## 1 0.4177189 0.5822811
##
## DNSRecord
## Y -1 1
## -1 0.3510422 0.6489578
## 1 0.2818737 0.7181263
##
## web_traffic
## Y -1 0 1
## -1 0.3446873 0.3482461 0.3070666
## 1 0.1592668 0.1403259 0.7004073
##
## Page_Rank
## Y -1 1
## -1 0.7890188 0.2109812
## 1 0.7036660 0.2963340
##
## Google_Index
## Y -1 1
## -1 0.1924250 0.8075750
## 1 0.1032587 0.8967413
##
## Links_pointing_to_page
## Y -1 0 1
## -1 0.03914591 0.59379766 0.36705643
## 1 0.05682281 0.52403259 0.41914460
##
## Statistical_report
## Y -1 1
## -1 0.1736146 0.8263854
## 1 0.1175153 0.8824847
# Inspect the internal structure of the fitted Naive Bayes model object.
str(nb_model)
## List of 4
## $ apriori: 'table' int [1:2(1d)] 3934 4910
## ..- attr(*, "dimnames")=List of 1
## .. ..$ Y: chr [1:2] "-1" "1"
## $ tables :List of 30
## ..$ IP_Address : table [1:2, 1:2] 0.396 0.303 0.604 0.697
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ IP_Address: chr [1:2] "-1" "1"
## ..$ URL_Length : table [1:2, 1:3] 0.83706 0.79043 0.01627 0.00815 0.14667 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ URL_Length: chr [1:3] "-1" "0" "1"
## ..$ Shortining_Service : table [1:2, 1:2] 0.103 0.154 0.897 0.846
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ Shortining_Service: chr [1:2] "-1" "1"
## ..$ having_At_Symbol : table [1:2, 1:2] 0.173 0.131 0.827 0.869
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ having_At_Symbol: chr [1:2] "-1" "1"
## ..$ double_slash_redirecting : table [1:2, 1:2] 0.112 0.143 0.888 0.857
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ double_slash_redirecting: chr [1:2] "-1" "1"
## ..$ Prefix_Suffix : table [1:2, 1:2] 1 0.765 0 0.235
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ Prefix_Suffix: chr [1:2] "-1" "1"
## ..$ having_Sub_Domain : table [1:2, 1:3] 0.38 0.242 0.46 0.222 0.16 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ having_Sub_Domain: chr [1:3] "-1" "0" "1"
## ..$ SSLfinal_State : table [1:2, 1:3] 0.62049 0.08167 0.23691 0.00305 0.1426 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ SSLfinal_State: chr [1:3] "-1" "0" "1"
## ..$ Domain_registeration_length: table [1:2, 1:2] 0.55 0.766 0.45 0.234
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ Domain_registeration_length: chr [1:2] "-1" "1"
## ..$ Favicon : table [1:2, 1:2] 0.191 0.186 0.809 0.814
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ Favicon: chr [1:2] "-1" "1"
## ..$ port : table [1:2, 1:2] 0.155 0.124 0.845 0.876
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ port: chr [1:2] "-1" "1"
## ..$ HTTPS_token : table [1:2, 1:2] 0.146 0.175 0.854 0.825
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ HTTPS_token: chr [1:2] "-1" "1"
## ..$ Request_URL : table [1:2, 1:2] 0.549 0.293 0.451 0.707
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ Request_URL: chr [1:2] "-1" "1"
## ..$ URL_of_Anchor : table [1:2, 1:3] 0.66345 0.00672 0.30529 0.62281 0.03127 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ URL_of_Anchor: chr [1:3] "-1" "0" "1"
## ..$ Links_in_tags : table [1:2, 1:3] 0.485 0.254 0.358 0.443 0.157 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ Links_in_tags: chr [1:3] "-1" "0" "1"
## ..$ SFH : table [1:2, 1:3] 0.8673 0.6827 0.0554 0.0823 0.0773 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ SFH: chr [1:3] "-1" "0" "1"
## ..$ Submitting_to_email : table [1:2, 1:2] 0.197 0.174 0.803 0.826
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ Submitting_to_email: chr [1:2] "-1" "1"
## ..$ Abnormal_URL : table [1:2, 1:2] 0.125 0.168 0.875 0.832
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ Abnormal_URL: chr [1:2] "-1" "1"
## ..$ Redirect : table [1:2, 1:2] 0.875 0.889 0.125 0.111
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ Redirect: chr [1:2] "0" "1"
## ..$ on_mouseover : table [1:2, 1:2] 0.137 0.107 0.863 0.893
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ on_mouseover: chr [1:2] "-1" "1"
## ..$ RightClick : table [1:2, 1:2] 0.0478 0.0411 0.9522 0.9589
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ RightClick: chr [1:2] "-1" "1"
## ..$ popUpWidnow : table [1:2, 1:2] 0.197 0.193 0.803 0.807
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ popUpWidnow: chr [1:2] "-1" "1"
## ..$ Iframe : table [1:2, 1:2] 0.0913 0.0925 0.9087 0.9075
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ Iframe: chr [1:2] "-1" "1"
## ..$ age_of_domain : table [1:2, 1:2] 0.539 0.418 0.461 0.582
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ age_of_domain: chr [1:2] "-1" "1"
## ..$ DNSRecord : table [1:2, 1:2] 0.351 0.282 0.649 0.718
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ DNSRecord: chr [1:2] "-1" "1"
## ..$ web_traffic : table [1:2, 1:3] 0.345 0.159 0.348 0.14 0.307 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ web_traffic: chr [1:3] "-1" "0" "1"
## ..$ Page_Rank : table [1:2, 1:2] 0.789 0.704 0.211 0.296
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ Page_Rank: chr [1:2] "-1" "1"
## ..$ Google_Index : table [1:2, 1:2] 0.192 0.103 0.808 0.897
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ Google_Index: chr [1:2] "-1" "1"
## ..$ Links_pointing_to_page : table [1:2, 1:3] 0.0391 0.0568 0.5938 0.524 0.3671 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ Links_pointing_to_page: chr [1:3] "-1" "0" "1"
## ..$ Statistical_report : table [1:2, 1:2] 0.174 0.118 0.826 0.882
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ Y : chr [1:2] "-1" "1"
## .. .. ..$ Statistical_report: chr [1:2] "-1" "1"
## $ levels : chr [1:2] "-1" "1"
## $ call : language naiveBayes.default(x = X, y = Y, laplace = laplace)
## - attr(*, "class")= chr "naiveBayes"
# Predict the test set with the Naive Bayes model. The target column is
# dropped by name rather than by the hard-coded position 31, so the call
# keeps working if the column order or count changes.
output <- predict(nb_model, test[, names(test) != "Result", drop = FALSE])
# Cross-tabulate predictions against the true labels, then print caret's
# confusion matrix with the derived statistics.
table(pred = output, true = test$Result)
confusionMatrix(output, test$Result)
## Confusion Matrix and Statistics
##
## Reference
## Prediction -1 1
## -1 868 58
## 1 96 1189
##
## Accuracy : 0.9303
## 95% CI : (0.9189, 0.9406)
## No Information Rate : 0.564
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8577
## Mcnemar's Test P-Value : 0.002868
##
## Sensitivity : 0.9004
## Specificity : 0.9535
## Pos Pred Value : 0.9374
## Neg Pred Value : 0.9253
## Prevalence : 0.4360
## Detection Rate : 0.3926
## Detection Prevalence : 0.4188
## Balanced Accuracy : 0.9270
##
## 'Positive' Class : -1
##
# trainControl for Random Forest: 5-fold cross-validation repeated 5 times,
# printing progress for every fold / tuning-parameter combination.
fitControl <- trainControl(
  method = "repeatedcv", repeats = 5,
  number = 5, verboseIter = TRUE
)
# Run a Random Forest classification over the training set.
# tuneLength = 5 asks caret to evaluate five candidate values of the tuning
# parameter; importance = TRUE is passed through to the underlying model so
# that variable importance is available afterwards.
rf.fit <- train(
  Result ~ .,
  data = train, method = "rf",
  importance = TRUE, trControl = fitControl,
  tuneLength = 5
)
## + Fold1.Rep1: mtry= 2
## - Fold1.Rep1: mtry= 2
## + Fold1.Rep1: mtry=11
## - Fold1.Rep1: mtry=11
## + Fold1.Rep1: mtry=20
## - Fold1.Rep1: mtry=20
## + Fold1.Rep1: mtry=29
## - Fold1.Rep1: mtry=29
## + Fold1.Rep1: mtry=38
## - Fold1.Rep1: mtry=38
## + Fold2.Rep1: mtry= 2
## - Fold2.Rep1: mtry= 2
## + Fold2.Rep1: mtry=11
## - Fold2.Rep1: mtry=11
## + Fold2.Rep1: mtry=20
## - Fold2.Rep1: mtry=20
## + Fold2.Rep1: mtry=29
## - Fold2.Rep1: mtry=29
## + Fold2.Rep1: mtry=38
## - Fold2.Rep1: mtry=38
## + Fold3.Rep1: mtry= 2
## - Fold3.Rep1: mtry= 2
## + Fold3.Rep1: mtry=11
## - Fold3.Rep1: mtry=11
## + Fold3.Rep1: mtry=20
## - Fold3.Rep1: mtry=20
## + Fold3.Rep1: mtry=29
## - Fold3.Rep1: mtry=29
## + Fold3.Rep1: mtry=38
## - Fold3.Rep1: mtry=38
## + Fold4.Rep1: mtry= 2
## - Fold4.Rep1: mtry= 2
## + Fold4.Rep1: mtry=11
## - Fold4.Rep1: mtry=11
## + Fold4.Rep1: mtry=20
## - Fold4.Rep1: mtry=20
## + Fold4.Rep1: mtry=29
## - Fold4.Rep1: mtry=29
## + Fold4.Rep1: mtry=38
## - Fold4.Rep1: mtry=38
## + Fold5.Rep1: mtry= 2
## - Fold5.Rep1: mtry= 2
## + Fold5.Rep1: mtry=11
## - Fold5.Rep1: mtry=11
## + Fold5.Rep1: mtry=20
## - Fold5.Rep1: mtry=20
## + Fold5.Rep1: mtry=29
## - Fold5.Rep1: mtry=29
## + Fold5.Rep1: mtry=38
## - Fold5.Rep1: mtry=38
## + Fold1.Rep2: mtry= 2
## - Fold1.Rep2: mtry= 2
## + Fold1.Rep2: mtry=11
## - Fold1.Rep2: mtry=11
## + Fold1.Rep2: mtry=20
## - Fold1.Rep2: mtry=20
## + Fold1.Rep2: mtry=29
## - Fold1.Rep2: mtry=29
## + Fold1.Rep2: mtry=38
## - Fold1.Rep2: mtry=38
## + Fold2.Rep2: mtry= 2
## - Fold2.Rep2: mtry= 2
## + Fold2.Rep2: mtry=11
## - Fold2.Rep2: mtry=11
## + Fold2.Rep2: mtry=20
## - Fold2.Rep2: mtry=20
## + Fold2.Rep2: mtry=29
## - Fold2.Rep2: mtry=29
## + Fold2.Rep2: mtry=38
## - Fold2.Rep2: mtry=38
## + Fold3.Rep2: mtry= 2
## - Fold3.Rep2: mtry= 2
## + Fold3.Rep2: mtry=11
## - Fold3.Rep2: mtry=11
## + Fold3.Rep2: mtry=20
## - Fold3.Rep2: mtry=20
## + Fold3.Rep2: mtry=29
## - Fold3.Rep2: mtry=29
## + Fold3.Rep2: mtry=38
## - Fold3.Rep2: mtry=38
## + Fold4.Rep2: mtry= 2
## - Fold4.Rep2: mtry= 2
## + Fold4.Rep2: mtry=11
## - Fold4.Rep2: mtry=11
## + Fold4.Rep2: mtry=20
## - Fold4.Rep2: mtry=20
## + Fold4.Rep2: mtry=29
## - Fold4.Rep2: mtry=29
## + Fold4.Rep2: mtry=38
## - Fold4.Rep2: mtry=38
## + Fold5.Rep2: mtry= 2
## - Fold5.Rep2: mtry= 2
## + Fold5.Rep2: mtry=11
## - Fold5.Rep2: mtry=11
## + Fold5.Rep2: mtry=20
## - Fold5.Rep2: mtry=20
## + Fold5.Rep2: mtry=29
## - Fold5.Rep2: mtry=29
## + Fold5.Rep2: mtry=38
## - Fold5.Rep2: mtry=38
## + Fold1.Rep3: mtry= 2
## - Fold1.Rep3: mtry= 2
## + Fold1.Rep3: mtry=11
## - Fold1.Rep3: mtry=11
## + Fold1.Rep3: mtry=20
## - Fold1.Rep3: mtry=20
## + Fold1.Rep3: mtry=29
## - Fold1.Rep3: mtry=29
## + Fold1.Rep3: mtry=38
## - Fold1.Rep3: mtry=38
## + Fold2.Rep3: mtry= 2
## - Fold2.Rep3: mtry= 2
## + Fold2.Rep3: mtry=11
## - Fold2.Rep3: mtry=11
## + Fold2.Rep3: mtry=20
## - Fold2.Rep3: mtry=20
## + Fold2.Rep3: mtry=29
## - Fold2.Rep3: mtry=29
## + Fold2.Rep3: mtry=38
## - Fold2.Rep3: mtry=38
## + Fold3.Rep3: mtry= 2
## - Fold3.Rep3: mtry= 2
## + Fold3.Rep3: mtry=11
## - Fold3.Rep3: mtry=11
## + Fold3.Rep3: mtry=20
## - Fold3.Rep3: mtry=20
## + Fold3.Rep3: mtry=29
## - Fold3.Rep3: mtry=29
## + Fold3.Rep3: mtry=38
## - Fold3.Rep3: mtry=38
## + Fold4.Rep3: mtry= 2
## - Fold4.Rep3: mtry= 2
## + Fold4.Rep3: mtry=11
## - Fold4.Rep3: mtry=11
## + Fold4.Rep3: mtry=20
## - Fold4.Rep3: mtry=20
## + Fold4.Rep3: mtry=29
## - Fold4.Rep3: mtry=29
## + Fold4.Rep3: mtry=38
## - Fold4.Rep3: mtry=38
## + Fold5.Rep3: mtry= 2
## - Fold5.Rep3: mtry= 2
## + Fold5.Rep3: mtry=11
## - Fold5.Rep3: mtry=11
## + Fold5.Rep3: mtry=20
## - Fold5.Rep3: mtry=20
## + Fold5.Rep3: mtry=29
## - Fold5.Rep3: mtry=29
## + Fold5.Rep3: mtry=38
## - Fold5.Rep3: mtry=38
## + Fold1.Rep4: mtry= 2
## - Fold1.Rep4: mtry= 2
## + Fold1.Rep4: mtry=11
## - Fold1.Rep4: mtry=11
## + Fold1.Rep4: mtry=20
## - Fold1.Rep4: mtry=20
## + Fold1.Rep4: mtry=29
## - Fold1.Rep4: mtry=29
## + Fold1.Rep4: mtry=38
## - Fold1.Rep4: mtry=38
## + Fold2.Rep4: mtry= 2
## - Fold2.Rep4: mtry= 2
## + Fold2.Rep4: mtry=11
## - Fold2.Rep4: mtry=11
## + Fold2.Rep4: mtry=20
## - Fold2.Rep4: mtry=20
## + Fold2.Rep4: mtry=29
## - Fold2.Rep4: mtry=29
## + Fold2.Rep4: mtry=38
## - Fold2.Rep4: mtry=38
## + Fold3.Rep4: mtry= 2
## - Fold3.Rep4: mtry= 2
## + Fold3.Rep4: mtry=11
## - Fold3.Rep4: mtry=11
## + Fold3.Rep4: mtry=20
## - Fold3.Rep4: mtry=20
## + Fold3.Rep4: mtry=29
## - Fold3.Rep4: mtry=29
## + Fold3.Rep4: mtry=38
## - Fold3.Rep4: mtry=38
## + Fold4.Rep4: mtry= 2
## - Fold4.Rep4: mtry= 2
## + Fold4.Rep4: mtry=11
## - Fold4.Rep4: mtry=11
## + Fold4.Rep4: mtry=20
## - Fold4.Rep4: mtry=20
## + Fold4.Rep4: mtry=29
## - Fold4.Rep4: mtry=29
## + Fold4.Rep4: mtry=38
## - Fold4.Rep4: mtry=38
## + Fold5.Rep4: mtry= 2
## - Fold5.Rep4: mtry= 2
## + Fold5.Rep4: mtry=11
## - Fold5.Rep4: mtry=11
## + Fold5.Rep4: mtry=20
## - Fold5.Rep4: mtry=20
## + Fold5.Rep4: mtry=29
## - Fold5.Rep4: mtry=29
## + Fold5.Rep4: mtry=38
## - Fold5.Rep4: mtry=38
## + Fold1.Rep5: mtry= 2
## - Fold1.Rep5: mtry= 2
## + Fold1.Rep5: mtry=11
## - Fold1.Rep5: mtry=11
## + Fold1.Rep5: mtry=20
## - Fold1.Rep5: mtry=20
## + Fold1.Rep5: mtry=29
## - Fold1.Rep5: mtry=29
## + Fold1.Rep5: mtry=38
## - Fold1.Rep5: mtry=38
## + Fold2.Rep5: mtry= 2
## - Fold2.Rep5: mtry= 2
## + Fold2.Rep5: mtry=11
## - Fold2.Rep5: mtry=11
## + Fold2.Rep5: mtry=20
## - Fold2.Rep5: mtry=20
## + Fold2.Rep5: mtry=29
## - Fold2.Rep5: mtry=29
## + Fold2.Rep5: mtry=38
## - Fold2.Rep5: mtry=38
## + Fold3.Rep5: mtry= 2
## - Fold3.Rep5: mtry= 2
## + Fold3.Rep5: mtry=11
## - Fold3.Rep5: mtry=11
## + Fold3.Rep5: mtry=20
## - Fold3.Rep5: mtry=20
## + Fold3.Rep5: mtry=29
## - Fold3.Rep5: mtry=29
## + Fold3.Rep5: mtry=38
## - Fold3.Rep5: mtry=38
## + Fold4.Rep5: mtry= 2
## - Fold4.Rep5: mtry= 2
## + Fold4.Rep5: mtry=11
## - Fold4.Rep5: mtry=11
## + Fold4.Rep5: mtry=20
## - Fold4.Rep5: mtry=20
## + Fold4.Rep5: mtry=29
## - Fold4.Rep5: mtry=29
## + Fold4.Rep5: mtry=38
## - Fold4.Rep5: mtry=38
## + Fold5.Rep5: mtry= 2
## - Fold5.Rep5: mtry= 2
## + Fold5.Rep5: mtry=11
## - Fold5.Rep5: mtry=11
## + Fold5.Rep5: mtry=20
## - Fold5.Rep5: mtry=20
## + Fold5.Rep5: mtry=29
## - Fold5.Rep5: mtry=29
## + Fold5.Rep5: mtry=38
## - Fold5.Rep5: mtry=38
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 20 on full training set
# Predict the testing target with the tuned random forest. The target column
# is dropped by name rather than by the hard-coded position 31.
rf.predict <- predict(rf.fit, test[, names(test) != "Result", drop = FALSE])
# Compare the predictions with the true labels.
confusionMatrix(rf.predict, test$Result)
## Confusion Matrix and Statistics
##
## Reference
## Prediction -1 1
## -1 914 13
## 1 50 1234
##
## Accuracy : 0.9715
## 95% CI : (0.9637, 0.978)
## No Information Rate : 0.564
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9418
## Mcnemar's Test P-Value : 5.745e-06
##
## Sensitivity : 0.9481
## Specificity : 0.9896
## Pos Pred Value : 0.9860
## Neg Pred Value : 0.9611
## Prevalence : 0.4360
## Detection Rate : 0.4134
## Detection Prevalence : 0.4193
## Balanced Accuracy : 0.9689
##
## 'Positive' Class : -1
##
# Plot the variable importance scores computed by caret for the fitted
# random forest model.
plot(varImp(rf.fit))

# Logistic Regression
# trainControl for Boosted Logistic Regression: 5-fold cross-validation
# repeated 5 times, printing progress for every fold.
fitControl <- trainControl(
  method = "repeatedcv", repeats = 5,
  number = 5, verboseIter = TRUE
)
# Run a Boosted Logistic Regression over the training set, letting caret
# evaluate five candidate values of the tuning parameter.
lg.fit <- train(
  Result ~ .,
  data = train,
  method = "LogitBoost", trControl = fitControl,
  tuneLength = 5
)
## + Fold1.Rep1: nIter=51
## - Fold1.Rep1: nIter=51
## + Fold2.Rep1: nIter=51
## - Fold2.Rep1: nIter=51
## + Fold3.Rep1: nIter=51
## - Fold3.Rep1: nIter=51
## + Fold4.Rep1: nIter=51
## - Fold4.Rep1: nIter=51
## + Fold5.Rep1: nIter=51
## - Fold5.Rep1: nIter=51
## + Fold1.Rep2: nIter=51
## - Fold1.Rep2: nIter=51
## + Fold2.Rep2: nIter=51
## - Fold2.Rep2: nIter=51
## + Fold3.Rep2: nIter=51
## - Fold3.Rep2: nIter=51
## + Fold4.Rep2: nIter=51
## - Fold4.Rep2: nIter=51
## + Fold5.Rep2: nIter=51
## - Fold5.Rep2: nIter=51
## + Fold1.Rep3: nIter=51
## - Fold1.Rep3: nIter=51
## + Fold2.Rep3: nIter=51
## - Fold2.Rep3: nIter=51
## + Fold3.Rep3: nIter=51
## - Fold3.Rep3: nIter=51
## + Fold4.Rep3: nIter=51
## - Fold4.Rep3: nIter=51
## + Fold5.Rep3: nIter=51
## - Fold5.Rep3: nIter=51
## + Fold1.Rep4: nIter=51
## - Fold1.Rep4: nIter=51
## + Fold2.Rep4: nIter=51
## - Fold2.Rep4: nIter=51
## + Fold3.Rep4: nIter=51
## - Fold3.Rep4: nIter=51
## + Fold4.Rep4: nIter=51
## - Fold4.Rep4: nIter=51
## + Fold5.Rep4: nIter=51
## - Fold5.Rep4: nIter=51
## + Fold1.Rep5: nIter=51
## - Fold1.Rep5: nIter=51
## + Fold2.Rep5: nIter=51
## - Fold2.Rep5: nIter=51
## + Fold3.Rep5: nIter=51
## - Fold3.Rep5: nIter=51
## + Fold4.Rep5: nIter=51
## - Fold4.Rep5: nIter=51
## + Fold5.Rep5: nIter=51
## - Fold5.Rep5: nIter=51
## Aggregating results
## Selecting tuning parameters
## Fitting nIter = 41 on full training set
# Predict the testing target with the boosted logistic regression model. The
# target column is dropped by name rather than by the hard-coded position 31.
log.predict <- predict(lg.fit, test[, names(test) != "Result", drop = FALSE])
# Compare the predictions with the true labels.
confusionMatrix(log.predict, test$Result)
## Confusion Matrix and Statistics
##
## Reference
## Prediction -1 1
## -1 866 56
## 1 98 1191
##
## Accuracy : 0.9303
## 95% CI : (0.9189, 0.9406)
## No Information Rate : 0.564
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8577
## Mcnemar's Test P-Value : 0.0009536
##
## Sensitivity : 0.8983
## Specificity : 0.9551
## Pos Pred Value : 0.9393
## Neg Pred Value : 0.9240
## Prevalence : 0.4360
## Detection Rate : 0.3917
## Detection Prevalence : 0.4170
## Balanced Accuracy : 0.9267
##
## 'Positive' Class : -1
##