library(readxl)
library(randomForest)
package 㤼㸱randomForest㤼㸲 was built under R version 3.4.3randomForest 4.6-12
Type rfNews() to see new features/changes/bug fixes.
library(DMwR)
package 㤼㸱DMwR㤼㸲 was built under R version 3.4.3Loading required package: lattice
Loading required package: grid
carss <- read_excel("D:/PG Business Analytics/ML/Group Assignment/Cars_excel.xlsx")
head(carss)
data<-carss
carss$Transport<-as.numeric(as.factor(carss$Transport))
carss$Gender<-as.numeric(as.factor(carss$Gender))
summary(carss)
      Age            Gender         Engineer           MBA            Work Exp   
 Min.   :18.00   Min.   :1.000   Min.   :0.0000   Min.   :0.0000   Min.   : 0.0  
 1st Qu.:25.00   1st Qu.:1.000   1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.: 3.0  
 Median :27.00   Median :2.000   Median :1.0000   Median :0.0000   Median : 5.0  
 Mean   :27.75   Mean   :1.712   Mean   :0.7545   Mean   :0.2528   Mean   : 6.3  
 3rd Qu.:30.00   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.: 8.0  
 Max.   :43.00   Max.   :2.000   Max.   :1.0000   Max.   :1.0000   Max.   :24.0  
                                                  NA's   :1                      
     Salary         Distance        license         Transport    
 Min.   : 6.50   Min.   : 3.20   Min.   :0.0000   Min.   :1.000  
 1st Qu.: 9.80   1st Qu.: 8.80   1st Qu.:0.0000   1st Qu.:2.000  
 Median :13.60   Median :11.00   Median :0.0000   Median :3.000  
 Mean   :16.24   Mean   :11.32   Mean   :0.2342   Mean   :2.489  
 3rd Qu.:15.72   3rd Qu.:13.43   3rd Qu.:0.0000   3rd Qu.:3.000  
 Max.   :57.00   Max.   :23.40   Max.   :1.0000   Max.   :3.000  
                                                                 
sapply(carss, function(y) sum(length(which(is.na(y)))))
      Age    Gender  Engineer       MBA  Work Exp    Salary  Distance   license Transport 
        0         0         0         1         0         0         0         0         0 
carss[is.na(carss$MBA),4]<-0
sapply(carss, function(y) sum(length(which(is.na(y)))))
      Age    Gender  Engineer       MBA  Work Exp    Salary  Distance   license Transport 
        0         0         0         0         0         0         0         0         0 
names(carss)[5]<-'Work_Exp'
names(carss)
[1] "Age"       "Gender"    "Engineer"  "MBA"       "Work_Exp"  "Salary"    "Distance" 
[8] "license"   "Transport"
nrow(carss[carss$Transport=='1',])
[1] 83
nrow(carss[carss$Transport=='2',])
[1] 61
nrow(carss[carss$Transport=='3',])
[1] 300
cars2<-rbind(carss,carss,carss)
summary(lm(Transport~.,data=carss))

Call:
lm(formula = Transport ~ ., data = carss)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.1084 -0.2521  0.2077  0.4617  1.1778 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.530873   0.512355   1.036    0.301    
Age          0.087329   0.021402   4.080 5.35e-05 ***
Gender       0.365830   0.076868   4.759 2.65e-06 ***
Engineer    -0.012362   0.078716  -0.157    0.875    
MBA          0.128405   0.078676   1.632    0.103    
Work_Exp    -0.039437   0.026114  -1.510    0.132    
Salary      -0.008323   0.009589  -0.868    0.386    
Distance    -0.052932   0.010538  -5.023 7.44e-07 ***
license     -0.561885   0.095513  -5.883 8.05e-09 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.7083 on 435 degrees of freedom
Multiple R-squared:  0.2119,    Adjusted R-squared:  0.1974 
F-statistic: 14.62 on 8 and 435 DF,  p-value: < 2.2e-16
step(lm(Transport~.,data=carss))
Start:  AIC=-297.32
Transport ~ Age + Gender + Engineer + MBA + Work_Exp + Salary + 
    Distance + license

           Df Sum of Sq    RSS     AIC
- Engineer  1    0.0124 218.26 -299.30
- Salary    1    0.3780 218.63 -298.56
<none>                  218.25 -297.32
- Work_Exp  1    1.1443 219.39 -297.00
- MBA       1    1.3364 219.59 -296.61
- Age       1    8.3531 226.60 -282.65
- Gender    1   11.3640 229.61 -276.79
- Distance  1   12.6585 230.91 -274.29
- license   1   17.3634 235.61 -265.34

Step:  AIC=-299.3
Transport ~ Age + Gender + MBA + Work_Exp + Salary + Distance + 
    license

           Df Sum of Sq    RSS     AIC
- Salary    1    0.3817 218.64 -300.52
<none>                  218.26 -299.30
- Work_Exp  1    1.1394 219.40 -298.99
- MBA       1    1.3251 219.59 -298.61
- Age       1    8.3410 226.60 -284.65
- Gender    1   11.3586 229.62 -278.77
- Distance  1   12.6833 230.94 -276.22
- license   1   17.3520 235.61 -267.33

Step:  AIC=-300.52
Transport ~ Age + Gender + MBA + Work_Exp + Distance + license

           Df Sum of Sq    RSS     AIC
<none>                  218.64 -300.52
- MBA       1    1.4093 220.05 -299.67
- Work_Exp  1    4.4954 223.14 -293.49
- Age       1    8.7858 227.43 -285.03
- Gender    1   11.3760 230.02 -280.00
- Distance  1   14.8403 233.48 -273.37
- license   1   19.7272 238.37 -264.17

Call:
lm(formula = Transport ~ Age + Gender + MBA + Work_Exp + Distance + 
    license, data = carss)

Coefficients:
(Intercept)          Age       Gender          MBA     Work_Exp     Distance      license  
    0.47052      0.08903      0.36601      0.13136     -0.05540     -0.05535     -0.58125  
summary(glm(Transport~.,data=carss,family = "poisson"))

Call:
glm(formula = Transport ~ ., family = "poisson", data = carss)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4298  -0.1574   0.1234   0.2926   0.7538  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)   
(Intercept)  0.116992   0.466795   0.251  0.80210   
Age          0.035387   0.019380   1.826  0.06786 . 
Gender       0.147952   0.070002   2.114  0.03456 * 
Engineer    -0.004134   0.070342  -0.059  0.95313   
MBA          0.051104   0.069464   0.736  0.46192   
Work_Exp    -0.016168   0.023411  -0.691  0.48983   
Salary      -0.003543   0.008775  -0.404  0.68641   
Distance    -0.021827   0.009586  -2.277  0.02280 * 
license     -0.233686   0.089643  -2.607  0.00914 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 131.60  on 443  degrees of freedom
Residual deviance: 107.57  on 435  degrees of freedom
AIC: 1348.6

Number of Fisher Scoring iterations: 4
rf<-randomForest(as.factor(Transport)~.,data=carss, replace=TRUE,mtry=15,ntree=101)
invalid mtry: reset to within valid range
rf

Call:
 randomForest(formula = as.factor(Transport) ~ ., data = carss,      replace = TRUE, mtry = 15, ntree = 101) 
               Type of random forest: classification
                     Number of trees: 101
No. of variables tried at each split: 8

        OOB estimate of  error rate: 18.02%
Confusion matrix:
   1  2   3 class.error
1 42  2  39  0.49397590
2  2 51   8  0.16393443
3 24  5 271  0.09666667
rf2<-randomForest(as.factor(Transport)~.,data=cars2, replace=TRUE,mtry=15,ntree=101)
invalid mtry: reset to within valid range
rf2

Call:
 randomForest(formula = as.factor(Transport) ~ ., data = cars2,      replace = TRUE, mtry = 15, ntree = 101) 
               Type of random forest: classification
                     Number of trees: 101
No. of variables tried at each split: 8

        OOB estimate of  error rate: 0%
Confusion matrix:
    1   2   3 class.error
1 249   0   0           0
2   0 183   0           0
3   0   0 900           0

Can you get a better accuracy than this?? :P

knnc<-knn(train=carss[,-9],test=carss[,-9],cl=carss$Transport)
Error in knn(train = carss[, -9], test = carss[, -9], cl = carss$Transport) : 
  could not find function "knn"

100% accuracy again

index<-as.vector(createDataPartition(carss$Transport,p=.9))
train<-carss[index$Resample1,]
test<-carss[-index$Resample1,]
knnc<-knn(train,test,cl=train$Transport)
table(actual=test$Transport,pred=knnc)
      pred
actual  1  2  3
     1 10  0  0
     2  0  3  1
     3  2  0 28

86% on divinding the data into test and train using KNN with 90-10 partining.

predrf<-predict(rf2,test)
data.frame(predrf)
nrow(data[data$tran=='1',])
[1] 61
summary(lm(tran~.,data[,-c(2,9)]))

Call:
lm(formula = tran ~ ., data = data[, -c(2, 9)])

Residuals:
     Min       1Q   Median       3Q      Max 
-0.63884 -0.11178 -0.01968  0.07366  0.84096 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.827563   0.148428  -5.576 4.34e-08 ***
Age          0.021849   0.006329   3.453 0.000610 ***
Engineer     0.019225   0.023349   0.823 0.410760    
MBA         -0.029011   0.023261  -1.247 0.213013    
`Work Exp`  -0.004513   0.007720  -0.585 0.559173    
Salary       0.014735   0.002835   5.198 3.11e-07 ***
Distance     0.011620   0.003116   3.729 0.000217 ***
license      0.138539   0.028238   4.906 1.32e-06 ***
Gen         -0.032647   0.022778  -1.433 0.152496    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2094 on 434 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.6382,    Adjusted R-squared:  0.6315 
F-statistic:  95.7 on 8 and 434 DF,  p-value: < 2.2e-16
d<-rbind(data[,-c(2,9)],data[data$tran=="1",-c(2,9)])
summary(lm(tran~.,data=d))

Call:
lm(formula = tran ~ ., data = d)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.68871 -0.14735 -0.02892  0.11058  0.80580 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -1.119865   0.161671  -6.927 1.34e-11 ***
Age          0.035378   0.006886   5.138 4.01e-07 ***
Engineer     0.025860   0.025485   1.015  0.31073    
MBA         -0.028943   0.025168  -1.150  0.25072    
`Work Exp`  -0.001320   0.008184  -0.161  0.87193    
Salary       0.008869   0.002758   3.215  0.00139 ** 
Distance     0.013880   0.003306   4.198 3.19e-05 ***
license      0.175674   0.029983   5.859 8.49e-09 ***
Gen         -0.054273   0.024703  -2.197  0.02848 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2387 on 495 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.695, Adjusted R-squared:  0.6901 
F-statistic:   141 on 8 and 495 DF,  p-value: < 2.2e-16
summary(glm(tran~.,data=d,family="binomial"))

Call:
glm(formula = tran ~ ., family = "binomial", data = d)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.37729  -0.03576  -0.00387  -0.00001   1.83674  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept) -74.65735   13.59641  -5.491 4.00e-08 ***
Age           2.40321    0.45381   5.296 1.19e-07 ***
Engineer      0.89974    0.76163   1.181 0.237466    
MBA          -1.78397    0.74263  -2.402 0.016296 *  
`Work Exp`   -1.26473    0.30613  -4.131 3.61e-05 ***
Salary        0.19912    0.06084   3.273 0.001064 ** 
Distance      0.51229    0.12527   4.090 4.32e-05 ***
license       2.66464    0.73612   3.620 0.000295 ***
Gen          -1.96273    0.73155  -2.683 0.007297 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 557.874  on 503  degrees of freedom
Residual deviance:  88.333  on 495  degrees of freedom
  (1 observation deleted due to missingness)
AIC: 106.33

Number of Fisher Scoring iterations: 9
LS0tDQp0aXRsZTogIk1MIGV4Y2VyY2lzZSINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCiANCg0KYGBge3J9DQpsaWJyYXJ5KHJlYWR4bCkNCmxpYnJhcnkocmFuZG9tRm9yZXN0KQ0KbGlicmFyeShETXdSKQ0KYGBgDQoNCg0KYGBge3J9DQpjYXJzcyA8LSByZWFkX2V4Y2VsKCJEOi9QRyBCdXNpbmVzcyBBbmFseXRpY3MvTUwvR3JvdXAgQXNzaWdubWVudC9DYXJzX2V4Y2VsLnhsc3giKQ0KaGVhZChjYXJzcykNCmBgYA0KDQpgYGB7cn0NCmRhdGE8LWNhcnNzDQpjYXJzcyRUcmFuc3BvcnQ8LWFzLm51bWVyaWMoYXMuZmFjdG9yKGNhcnNzJFRyYW5zcG9ydCkpDQpjYXJzcyRHZW5kZXI8LWFzLm51bWVyaWMoYXMuZmFjdG9yKGNhcnNzJEdlbmRlcikpDQpzdW1tYXJ5KGNhcnNzKQ0KYGBgDQoNCmBgYHtyfQ0Kc2FwcGx5KGNhcnNzLCBmdW5jdGlvbih5KSBzdW0obGVuZ3RoKHdoaWNoKGlzLm5hKHkpKSkpKQ0KYGBgDQoNCmBgYHtyfQ0KY2Fyc3NbaXMubmEoY2Fyc3MkTUJBKSw0XTwtMA0Kc2FwcGx5KGNhcnNzLCBmdW5jdGlvbih5KSBzdW0obGVuZ3RoKHdoaWNoKGlzLm5hKHkpKSkpKQ0KYGBgDQoNCmBgYHtyfQ0KbmFtZXMoY2Fyc3MpWzVdPC0nV29ya19FeHAnDQpuYW1lcyhjYXJzcykNCmBgYA0KDQoNCmBgYHtyfQ0KbnJvdyhjYXJzc1tjYXJzcyRUcmFuc3BvcnQ9PScxJyxdKQ0KbnJvdyhjYXJzc1tjYXJzcyRUcmFuc3BvcnQ9PScyJyxdKQ0KbnJvdyhjYXJzc1tjYXJzcyRUcmFuc3BvcnQ9PSczJyxdKQ0KYGBgDQoNCmBgYHtyfQ0KY2FyczI8LXJiaW5kKGNhcnNzLGNhcnNzLGNhcnNzKQ0KYGBgDQoNCmBgYHtyfQ0Kc3VtbWFyeShsbShUcmFuc3BvcnR+LixkYXRhPWNhcnNzKSkNCmBgYA0KDQpgYGB7cn0NCnN0ZXAobG0oVHJhbnNwb3J0fi4sZGF0YT1jYXJzcykpDQpgYGANCg0KDQoNCmBgYHtyfQ0Kc3VtbWFyeShnbG0oVHJhbnNwb3J0fi4sZGF0YT1jYXJzcyxmYW1pbHkgPSAicG9pc3NvbiIpKQ0KYGBgDQoNCmBgYHtyfQ0KcmY8LXJhbmRvbUZvcmVzdChhcy5mYWN0b3IoVHJhbnNwb3J0KX4uLGRhdGE9Y2Fyc3MsIHJlcGxhY2U9VFJVRSxtdHJ5PTE1LG50cmVlPTEwMSkNCnJmDQpgYGANCg0KDQpgYGB7cn0NCnJmMjwtcmFuZG9tRm9yZXN0KGFzLmZhY3RvcihUcmFuc3BvcnQpfi4sZGF0YT1jYXJzMiwgcmVwbGFjZT1UUlVFLG10cnk9MTUsbnRyZWU9MTAxKQ0KcmYyDQpgYGANCg0KPGgzPkNhbiB5b3UgZ2V0IGEgYmV0dGVyIGFjY3VyYWN5IHRoYW4gdGhpcz8/IA0KIDpQPC9oMz4NCg0KDQpgYGB7cn0NCmtubmM8LWtubih0cmFpbj1jYXJzc1ssLTldLHRlc3Q9Y2Fyc3NbLC05XSxjbD1jYXJzcyRUcmFuc3BvcnQpDQp0YWJsZShhY3R1YWw9Y2Fyc3MkVHJhbnNwb3J0LHByZWQ9a25uYykNCmBgYA0KDQo8aDI+MTAwJSBhY2N1cmFjeSBhZ2FpbjwvaDI+DQoNCmBgYHtyfQ0KaW5kZXg8LWFzLnZlY3RvcihjcmVhdGVEYXRhUGFydGl0aW9uKGNhcnNzJFRyYW5zcG9ydCxwPS45KSkNCnRyYWluPC1jYXJzc1tpbmRleCRSZXNhbXBsZTEsXQ0KdGVzdDwtY2Fyc3NbLWluZGV4JFJlc2FtcGxlMSxdDQprbm5jPC1rbm4odHJhaW4sdGVzdCxjbD10cmFpbiRUcmFuc3BvcnQpDQp0YWJsZShhY3R1YWw9dGVzdCRUcmFuc3BvcnQscHJlZD1rbm5jKQ0KYGBgDQoNCg0KPGgyPjg2JSBvbiBkaXZpbmRpbmcgdGhlIGRhdGEgaW50byB0ZXN0IGFuZCB0cmFpbiB1c2luZyBLTk4gd2l0aCA5MC0xMCBwYXJ0aW5pbmcuPC9oMj4NCg0KYGBge3J9DQpjMCA8LSBjKDI1LDI1KQ0KYzEgPC0gYygxLDApDQpjMiA8LSBjKDAsMSkNCmMzIDwtIGMoMCwwKQ0KYzQgPC0gYygyLDIpDQpjNSA8LSBjKDEwLDEwKQ0KYzYgPC0gYyg1LDUpDQpjNyA8LSBjKDEsMCkNCg0KdGVzdCA9IGRhdGEuZnJhbWUoYzAsYzEsYzIsYzMsYzQsYzUsYzYsYzcpDQpjb2xuYW1lcyh0ZXN0KTwtY29sbmFtZXMoY2Fyc3MpW2MoMTo4KV0NCnRlc3QNCmBgYA0KDQoNCmBgYHtyfQ0KcHJlZHJmPC1wcmVkaWN0KHJmMix0ZXN0KQ0KZGF0YS5mcmFtZShwcmVkcmYpDQpgYGANCg0KDQoNCmBgYHtyfQ0KZGF0YSR0cmFuPC1hcy5udW1lcmljKGRhdGEkVHJhbnNwb3J0PT0iQ2FyIikNCmRhdGEkR2VuPC1hcy5udW1lcmljKGRhdGEkR2VuZGVyPT0iTWFsZSIpDQpucm93KGRhdGFbZGF0YSR0cmFuPT0nMCcsXSkNCm5yb3coZGF0YVtkYXRhJHRyYW49PScxJyxdKQ0KYGBgDQoNCg0KYGBge3J9DQpzdW1tYXJ5KGxtKHRyYW5+LixkYXRhWywtYygyLDkpXSkpDQpgYGANCg0KYGBge3J9DQpkPC1yYmluZChkYXRhWywtYygyLDkpXSxkYXRhW2RhdGEkdHJhbj09IjEiLC1jKDIsOSldKQ0Kc3VtbWFyeShsbSh0cmFufi4sZGF0YT1kKSkNCg0KYGBgDQoNCg0KYGBge3J9DQpzdW1tYXJ5KGdsbSh0cmFufi4sZGF0YT1kLGZhbWlseT0iYmlub21pYWwiKSkNCmBgYA0K