library(readxl)
library(randomForest)
package 㤼㸱randomForest㤼㸲 was built under R version 3.4.3randomForest 4.6-12
Type rfNews() to see new features/changes/bug fixes.
library(DMwR)
package 㤼㸱DMwR㤼㸲 was built under R version 3.4.3Loading required package: lattice
Loading required package: grid
carss <- read_excel("D:/PG Business Analytics/ML/Group Assignment/Cars_excel.xlsx")
head(carss)
data<-carss
carss$Transport<-as.numeric(as.factor(carss$Transport))
carss$Gender<-as.numeric(as.factor(carss$Gender))
summary(carss)
Age Gender Engineer MBA Work Exp
Min. :18.00 Min. :1.000 Min. :0.0000 Min. :0.0000 Min. : 0.0
1st Qu.:25.00 1st Qu.:1.000 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.: 3.0
Median :27.00 Median :2.000 Median :1.0000 Median :0.0000 Median : 5.0
Mean :27.75 Mean :1.712 Mean :0.7545 Mean :0.2528 Mean : 6.3
3rd Qu.:30.00 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 8.0
Max. :43.00 Max. :2.000 Max. :1.0000 Max. :1.0000 Max. :24.0
NA's :1
Salary Distance license Transport
Min. : 6.50 Min. : 3.20 Min. :0.0000 Min. :1.000
1st Qu.: 9.80 1st Qu.: 8.80 1st Qu.:0.0000 1st Qu.:2.000
Median :13.60 Median :11.00 Median :0.0000 Median :3.000
Mean :16.24 Mean :11.32 Mean :0.2342 Mean :2.489
3rd Qu.:15.72 3rd Qu.:13.43 3rd Qu.:0.0000 3rd Qu.:3.000
Max. :57.00 Max. :23.40 Max. :1.0000 Max. :3.000
sapply(carss, function(y) sum(length(which(is.na(y)))))
Age Gender Engineer MBA Work Exp Salary Distance license Transport
0 0 0 1 0 0 0 0 0
carss[is.na(carss$MBA),4]<-0
sapply(carss, function(y) sum(length(which(is.na(y)))))
Age Gender Engineer MBA Work Exp Salary Distance license Transport
0 0 0 0 0 0 0 0 0
names(carss)[5]<-'Work_Exp'
names(carss)
[1] "Age" "Gender" "Engineer" "MBA" "Work_Exp" "Salary" "Distance"
[8] "license" "Transport"
nrow(carss[carss$Transport=='1',])
[1] 83
nrow(carss[carss$Transport=='2',])
[1] 61
nrow(carss[carss$Transport=='3',])
[1] 300
cars2<-rbind(carss,carss,carss)
summary(lm(Transport~.,data=carss))
Call:
lm(formula = Transport ~ ., data = carss)
Residuals:
Min 1Q Median 3Q Max
-2.1084 -0.2521 0.2077 0.4617 1.1778
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.530873 0.512355 1.036 0.301
Age 0.087329 0.021402 4.080 5.35e-05 ***
Gender 0.365830 0.076868 4.759 2.65e-06 ***
Engineer -0.012362 0.078716 -0.157 0.875
MBA 0.128405 0.078676 1.632 0.103
Work_Exp -0.039437 0.026114 -1.510 0.132
Salary -0.008323 0.009589 -0.868 0.386
Distance -0.052932 0.010538 -5.023 7.44e-07 ***
license -0.561885 0.095513 -5.883 8.05e-09 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.7083 on 435 degrees of freedom
Multiple R-squared: 0.2119, Adjusted R-squared: 0.1974
F-statistic: 14.62 on 8 and 435 DF, p-value: < 2.2e-16
step(lm(Transport~.,data=carss))
Start: AIC=-297.32
Transport ~ Age + Gender + Engineer + MBA + Work_Exp + Salary +
Distance + license
Df Sum of Sq RSS AIC
- Engineer 1 0.0124 218.26 -299.30
- Salary 1 0.3780 218.63 -298.56
<none> 218.25 -297.32
- Work_Exp 1 1.1443 219.39 -297.00
- MBA 1 1.3364 219.59 -296.61
- Age 1 8.3531 226.60 -282.65
- Gender 1 11.3640 229.61 -276.79
- Distance 1 12.6585 230.91 -274.29
- license 1 17.3634 235.61 -265.34
Step: AIC=-299.3
Transport ~ Age + Gender + MBA + Work_Exp + Salary + Distance +
license
Df Sum of Sq RSS AIC
- Salary 1 0.3817 218.64 -300.52
<none> 218.26 -299.30
- Work_Exp 1 1.1394 219.40 -298.99
- MBA 1 1.3251 219.59 -298.61
- Age 1 8.3410 226.60 -284.65
- Gender 1 11.3586 229.62 -278.77
- Distance 1 12.6833 230.94 -276.22
- license 1 17.3520 235.61 -267.33
Step: AIC=-300.52
Transport ~ Age + Gender + MBA + Work_Exp + Distance + license
Df Sum of Sq RSS AIC
<none> 218.64 -300.52
- MBA 1 1.4093 220.05 -299.67
- Work_Exp 1 4.4954 223.14 -293.49
- Age 1 8.7858 227.43 -285.03
- Gender 1 11.3760 230.02 -280.00
- Distance 1 14.8403 233.48 -273.37
- license 1 19.7272 238.37 -264.17
Call:
lm(formula = Transport ~ Age + Gender + MBA + Work_Exp + Distance +
license, data = carss)
Coefficients:
(Intercept) Age Gender MBA Work_Exp Distance license
0.47052 0.08903 0.36601 0.13136 -0.05540 -0.05535 -0.58125
summary(glm(Transport~.,data=carss,family = "poisson"))
Call:
glm(formula = Transport ~ ., family = "poisson", data = carss)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.4298 -0.1574 0.1234 0.2926 0.7538
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.116992 0.466795 0.251 0.80210
Age 0.035387 0.019380 1.826 0.06786 .
Gender 0.147952 0.070002 2.114 0.03456 *
Engineer -0.004134 0.070342 -0.059 0.95313
MBA 0.051104 0.069464 0.736 0.46192
Work_Exp -0.016168 0.023411 -0.691 0.48983
Salary -0.003543 0.008775 -0.404 0.68641
Distance -0.021827 0.009586 -2.277 0.02280 *
license -0.233686 0.089643 -2.607 0.00914 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 131.60 on 443 degrees of freedom
Residual deviance: 107.57 on 435 degrees of freedom
AIC: 1348.6
Number of Fisher Scoring iterations: 4
rf<-randomForest(as.factor(Transport)~.,data=carss, replace=TRUE,mtry=15,ntree=101)
invalid mtry: reset to within valid range
rf
Call:
randomForest(formula = as.factor(Transport) ~ ., data = carss, replace = TRUE, mtry = 15, ntree = 101)
Type of random forest: classification
Number of trees: 101
No. of variables tried at each split: 8
OOB estimate of error rate: 18.02%
Confusion matrix:
1 2 3 class.error
1 42 2 39 0.49397590
2 2 51 8 0.16393443
3 24 5 271 0.09666667
rf2<-randomForest(as.factor(Transport)~.,data=cars2, replace=TRUE,mtry=15,ntree=101)
invalid mtry: reset to within valid range
rf2
Call:
randomForest(formula = as.factor(Transport) ~ ., data = cars2, replace = TRUE, mtry = 15, ntree = 101)
Type of random forest: classification
Number of trees: 101
No. of variables tried at each split: 8
OOB estimate of error rate: 0%
Confusion matrix:
1 2 3 class.error
1 249 0 0 0
2 0 183 0 0
3 0 0 900 0
Can you get a better accuracy than this?? :P
knnc<-knn(train=carss[,-9],test=carss[,-9],cl=carss$Transport)
Error in knn(train = carss[, -9], test = carss[, -9], cl = carss$Transport) :
could not find function "knn"
100% accuracy again
index<-as.vector(createDataPartition(carss$Transport,p=.9))
train<-carss[index$Resample1,]
test<-carss[-index$Resample1,]
knnc<-knn(train,test,cl=train$Transport)
table(actual=test$Transport,pred=knnc)
pred
actual 1 2 3
1 10 0 0
2 0 3 1
3 2 0 28
86% on divinding the data into test and train using KNN with 90-10 partining.
predrf<-predict(rf2,test)
data.frame(predrf)
nrow(data[data$tran=='1',])
[1] 61
summary(lm(tran~.,data[,-c(2,9)]))
Call:
lm(formula = tran ~ ., data = data[, -c(2, 9)])
Residuals:
Min 1Q Median 3Q Max
-0.63884 -0.11178 -0.01968 0.07366 0.84096
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.827563 0.148428 -5.576 4.34e-08 ***
Age 0.021849 0.006329 3.453 0.000610 ***
Engineer 0.019225 0.023349 0.823 0.410760
MBA -0.029011 0.023261 -1.247 0.213013
`Work Exp` -0.004513 0.007720 -0.585 0.559173
Salary 0.014735 0.002835 5.198 3.11e-07 ***
Distance 0.011620 0.003116 3.729 0.000217 ***
license 0.138539 0.028238 4.906 1.32e-06 ***
Gen -0.032647 0.022778 -1.433 0.152496
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2094 on 434 degrees of freedom
(1 observation deleted due to missingness)
Multiple R-squared: 0.6382, Adjusted R-squared: 0.6315
F-statistic: 95.7 on 8 and 434 DF, p-value: < 2.2e-16
d<-rbind(data[,-c(2,9)],data[data$tran=="1",-c(2,9)])
summary(lm(tran~.,data=d))
Call:
lm(formula = tran ~ ., data = d)
Residuals:
Min 1Q Median 3Q Max
-0.68871 -0.14735 -0.02892 0.11058 0.80580
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.119865 0.161671 -6.927 1.34e-11 ***
Age 0.035378 0.006886 5.138 4.01e-07 ***
Engineer 0.025860 0.025485 1.015 0.31073
MBA -0.028943 0.025168 -1.150 0.25072
`Work Exp` -0.001320 0.008184 -0.161 0.87193
Salary 0.008869 0.002758 3.215 0.00139 **
Distance 0.013880 0.003306 4.198 3.19e-05 ***
license 0.175674 0.029983 5.859 8.49e-09 ***
Gen -0.054273 0.024703 -2.197 0.02848 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2387 on 495 degrees of freedom
(1 observation deleted due to missingness)
Multiple R-squared: 0.695, Adjusted R-squared: 0.6901
F-statistic: 141 on 8 and 495 DF, p-value: < 2.2e-16
summary(glm(tran~.,data=d,family="binomial"))
Call:
glm(formula = tran ~ ., family = "binomial", data = d)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.37729 -0.03576 -0.00387 -0.00001 1.83674
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -74.65735 13.59641 -5.491 4.00e-08 ***
Age 2.40321 0.45381 5.296 1.19e-07 ***
Engineer 0.89974 0.76163 1.181 0.237466
MBA -1.78397 0.74263 -2.402 0.016296 *
`Work Exp` -1.26473 0.30613 -4.131 3.61e-05 ***
Salary 0.19912 0.06084 3.273 0.001064 **
Distance 0.51229 0.12527 4.090 4.32e-05 ***
license 2.66464 0.73612 3.620 0.000295 ***
Gen -1.96273 0.73155 -2.683 0.007297 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 557.874 on 503 degrees of freedom
Residual deviance: 88.333 on 495 degrees of freedom
(1 observation deleted due to missingness)
AIC: 106.33
Number of Fisher Scoring iterations: 9
LS0tDQp0aXRsZTogIk1MIGV4Y2VyY2lzZSINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCiANCg0KYGBge3J9DQpsaWJyYXJ5KHJlYWR4bCkNCmxpYnJhcnkocmFuZG9tRm9yZXN0KQ0KbGlicmFyeShETXdSKQ0KYGBgDQoNCg0KYGBge3J9DQpjYXJzcyA8LSByZWFkX2V4Y2VsKCJEOi9QRyBCdXNpbmVzcyBBbmFseXRpY3MvTUwvR3JvdXAgQXNzaWdubWVudC9DYXJzX2V4Y2VsLnhsc3giKQ0KaGVhZChjYXJzcykNCmBgYA0KDQpgYGB7cn0NCmRhdGE8LWNhcnNzDQpjYXJzcyRUcmFuc3BvcnQ8LWFzLm51bWVyaWMoYXMuZmFjdG9yKGNhcnNzJFRyYW5zcG9ydCkpDQpjYXJzcyRHZW5kZXI8LWFzLm51bWVyaWMoYXMuZmFjdG9yKGNhcnNzJEdlbmRlcikpDQpzdW1tYXJ5KGNhcnNzKQ0KYGBgDQoNCmBgYHtyfQ0Kc2FwcGx5KGNhcnNzLCBmdW5jdGlvbih5KSBzdW0obGVuZ3RoKHdoaWNoKGlzLm5hKHkpKSkpKQ0KYGBgDQoNCmBgYHtyfQ0KY2Fyc3NbaXMubmEoY2Fyc3MkTUJBKSw0XTwtMA0Kc2FwcGx5KGNhcnNzLCBmdW5jdGlvbih5KSBzdW0obGVuZ3RoKHdoaWNoKGlzLm5hKHkpKSkpKQ0KYGBgDQoNCmBgYHtyfQ0KbmFtZXMoY2Fyc3MpWzVdPC0nV29ya19FeHAnDQpuYW1lcyhjYXJzcykNCmBgYA0KDQoNCmBgYHtyfQ0KbnJvdyhjYXJzc1tjYXJzcyRUcmFuc3BvcnQ9PScxJyxdKQ0KbnJvdyhjYXJzc1tjYXJzcyRUcmFuc3BvcnQ9PScyJyxdKQ0KbnJvdyhjYXJzc1tjYXJzcyRUcmFuc3BvcnQ9PSczJyxdKQ0KYGBgDQoNCmBgYHtyfQ0KY2FyczI8LXJiaW5kKGNhcnNzLGNhcnNzLGNhcnNzKQ0KYGBgDQoNCmBgYHtyfQ0Kc3VtbWFyeShsbShUcmFuc3BvcnR+LixkYXRhPWNhcnNzKSkNCmBgYA0KDQpgYGB7cn0NCnN0ZXAobG0oVHJhbnNwb3J0fi4sZGF0YT1jYXJzcykpDQpgYGANCg0KDQoNCmBgYHtyfQ0Kc3VtbWFyeShnbG0oVHJhbnNwb3J0fi4sZGF0YT1jYXJzcyxmYW1pbHkgPSAicG9pc3NvbiIpKQ0KYGBgDQoNCmBgYHtyfQ0KcmY8LXJhbmRvbUZvcmVzdChhcy5mYWN0b3IoVHJhbnNwb3J0KX4uLGRhdGE9Y2Fyc3MsIHJlcGxhY2U9VFJVRSxtdHJ5PTE1LG50cmVlPTEwMSkNCnJmDQpgYGANCg0KDQpgYGB7cn0NCnJmMjwtcmFuZG9tRm9yZXN0KGFzLmZhY3RvcihUcmFuc3BvcnQpfi4sZGF0YT1jYXJzMiwgcmVwbGFjZT1UUlVFLG10cnk9MTUsbnRyZWU9MTAxKQ0KcmYyDQpgYGANCg0KPGgzPkNhbiB5b3UgZ2V0IGEgYmV0dGVyIGFjY3VyYWN5IHRoYW4gdGhpcz8/IA0KIDpQPC9oMz4NCg0KDQpgYGB7cn0NCmtubmM8LWtubih0cmFpbj1jYXJzc1ssLTldLHRlc3Q9Y2Fyc3NbLC05XSxjbD1jYXJzcyRUcmFuc3BvcnQpDQp0YWJsZShhY3R1YWw9Y2Fyc3MkVHJhbnNwb3J0LHByZWQ9a25uYykNCmBgYA0KDQo8aDI+MTAwJSBhY2N1cmFjeSBhZ2FpbjwvaDI+DQoNCmBgYHtyfQ0KaW5kZXg8LWFzLnZlY3RvcihjcmVhdGVEYXRhUGFydGl0aW9uKGNhcnNzJFRyYW5zcG9ydCxwPS45KSkNCnRyYWluPC1jYXJzc1tpbmRleCRSZXNhbXBsZTEsXQ0KdGVzdDwtY2Fyc3NbLWluZGV4JFJlc2FtcGxlMSxdDQprbm5jPC1rbm4odHJhaW4sdGVzdCxjbD10cmFpbiRUcmFuc3BvcnQpDQp0YWJsZShhY3R1YWw9dGVzdCRUcmFuc3BvcnQscHJlZD1rbm5jKQ0KYGBgDQoNCg0KPGgyPjg2JSBvbiBkaXZpbmRpbmcgdGhlIGRhdGEgaW50byB0ZXN0IGFuZCB0cmFpbiB1c2luZyBLTk4gd2l0aCA5MC0xMCBwYXJ0aW5pbmcuPC9oMj4NCg0KYGBge3J9DQpjMCA8LSBjKDI1LDI1KQ0KYzEgPC0gYygxLDApDQpjMiA8LSBjKDAsMSkNCmMzIDwtIGMoMCwwKQ0KYzQgPC0gYygyLDIpDQpjNSA8LSBjKDEwLDEwKQ0KYzYgPC0gYyg1LDUpDQpjNyA8LSBjKDEsMCkNCg0KdGVzdCA9IGRhdGEuZnJhbWUoYzAsYzEsYzIsYzMsYzQsYzUsYzYsYzcpDQpjb2xuYW1lcyh0ZXN0KTwtY29sbmFtZXMoY2Fyc3MpW2MoMTo4KV0NCnRlc3QNCmBgYA0KDQoNCmBgYHtyfQ0KcHJlZHJmPC1wcmVkaWN0KHJmMix0ZXN0KQ0KZGF0YS5mcmFtZShwcmVkcmYpDQpgYGANCg0KDQoNCmBgYHtyfQ0KZGF0YSR0cmFuPC1hcy5udW1lcmljKGRhdGEkVHJhbnNwb3J0PT0iQ2FyIikNCmRhdGEkR2VuPC1hcy5udW1lcmljKGRhdGEkR2VuZGVyPT0iTWFsZSIpDQpucm93KGRhdGFbZGF0YSR0cmFuPT0nMCcsXSkNCm5yb3coZGF0YVtkYXRhJHRyYW49PScxJyxdKQ0KYGBgDQoNCg0KYGBge3J9DQpzdW1tYXJ5KGxtKHRyYW5+LixkYXRhWywtYygyLDkpXSkpDQpgYGANCg0KYGBge3J9DQpkPC1yYmluZChkYXRhWywtYygyLDkpXSxkYXRhW2RhdGEkdHJhbj09IjEiLC1jKDIsOSldKQ0Kc3VtbWFyeShsbSh0cmFufi4sZGF0YT1kKSkNCg0KYGBgDQoNCg0KYGBge3J9DQpzdW1tYXJ5KGdsbSh0cmFufi4sZGF0YT1kLGZhbWlseT0iYmlub21pYWwiKSkNCmBgYA0K