# Predict whether a person's salary is at most 50K or greater than 50K
# using a Naive Bayes model.

# Libraries
library(naivebayes)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.1
library(caret)
## Warning: package 'caret' was built under R version 3.5.1
## Loading required package: lattice
library(psych)
## Warning: package 'psych' was built under R version 3.5.1
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(e1071)
## Warning: package 'e1071' was built under R version 3.5.1
# Data(Train)
# Load the training set from a CSV file picked interactively.
# stringsAsFactors = TRUE keeps text columns as factors; since R 4.0 the
# default is FALSE, and the factor plots and confusionMatrix() further down
# need factor columns.
train_sal <- read.csv(file.choose(), stringsAsFactors = TRUE)
# Show each column's type and its first few values.
str(train_sal)
## 'data.frame':    30161 obs. of  14 variables:
##  $ age          : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ workclass    : Factor w/ 7 levels " Federal-gov",..: 6 5 3 3 3 3 3 5 3 3 ...
##  $ education    : Factor w/ 16 levels " 10th"," 11th",..: 10 10 12 2 10 13 7 12 13 10 ...
##  $ educationno  : int  13 13 9 7 13 14 5 9 14 13 ...
##  $ maritalstatus: Factor w/ 7 levels " Divorced"," Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
##  $ occupation   : Factor w/ 14 levels " Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
##  $ relationship : Factor w/ 6 levels " Husband"," Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
##  $ race         : Factor w/ 5 levels " Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
##  $ sex          : Factor w/ 2 levels " Female"," Male": 2 2 2 2 1 1 1 2 1 2 ...
##  $ capitalgain  : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capitalloss  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hoursperweek : int  40 13 40 40 40 40 16 45 50 40 ...
##  $ native       : Factor w/ 40 levels " Cambodia"," Canada",..: 38 38 38 38 5 38 22 38 38 38 ...
##  $ Salary       : Factor w/ 2 levels " <=50K"," >50K": 1 1 1 1 1 1 1 2 2 2 ...
# Open the data viewer only in interactive sessions; View() may fail or block
# when the script runs non-interactively (e.g. Rscript or knitting).
if (interactive()) {
  View(train_sal)
}
# Treat the education level code as a categorical variable, not a number.
train_sal$educationno <- as.factor(train_sal$educationno)
class(train_sal)
## [1] "data.frame"
# Data(Test)
# Load the test set from a CSV file picked interactively.
# stringsAsFactors = TRUE keeps text columns as factors; since R 4.0 the
# default is FALSE, and confusionMatrix() further down needs a factor
# reference column.
test_sal <- read.csv(file.choose(), stringsAsFactors = TRUE)
# Show each column's type and its first few values.
str(test_sal)
## 'data.frame':    15060 obs. of  14 variables:
##  $ age          : int  25 38 28 44 34 63 24 55 65 36 ...
##  $ workclass    : Factor w/ 7 levels " Federal-gov",..: 3 3 2 3 3 5 3 3 3 1 ...
##  $ education    : Factor w/ 16 levels " 10th"," 11th",..: 2 12 8 16 1 15 16 6 12 10 ...
##  $ educationno  : int  7 9 12 10 6 15 10 4 9 13 ...
##  $ maritalstatus: Factor w/ 7 levels " Divorced"," Married-AF-spouse",..: 5 3 3 3 5 3 5 3 3 3 ...
##  $ occupation   : Factor w/ 14 levels " Adm-clerical",..: 7 5 11 7 8 10 8 3 7 1 ...
##  $ relationship : Factor w/ 6 levels " Husband"," Not-in-family",..: 4 1 1 1 2 1 5 1 1 1 ...
##  $ race         : Factor w/ 5 levels " Amer-Indian-Eskimo",..: 3 5 5 3 5 5 5 5 5 5 ...
##  $ sex          : Factor w/ 2 levels " Female"," Male": 2 2 2 2 2 2 1 2 2 2 ...
##  $ capitalgain  : int  0 0 0 7688 0 3103 0 0 6418 0 ...
##  $ capitalloss  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hoursperweek : int  40 50 40 40 30 32 40 10 40 40 ...
##  $ native       : Factor w/ 40 levels " Cambodia"," Canada",..: 38 38 38 38 38 38 38 38 38 38 ...
##  $ Salary       : Factor w/ 2 levels " <=50K"," >50K": 1 1 2 2 1 2 1 1 2 1 ...
# Open the data viewer only in interactive sessions; View() may fail or block
# when the script runs non-interactively (e.g. Rscript or knitting).
if (interactive()) {
  View(test_sal)
}
# Treat the education level code as a categorical variable, matching the
# conversion applied to the training set.
test_sal$educationno <- as.factor(test_sal$educationno)
class(test_sal)
## [1] "data.frame"
#Visualization 
# Plot and ggplot 
# Box plot of age for each salary class. Columns are named bare inside aes()
# so ggplot2 resolves them from `data` instead of from the global data frame.
ggplot(data = train_sal, aes(x = Salary, y = age, fill = Salary)) +
  geom_boxplot() +
  ggtitle("Box Plot")

# Draw one plot of each categorical predictor against the salary class,
# in the order listed. invisible() suppresses printing of lapply()'s result.
invisible(lapply(
  c(
    "workclass", "education", "educationno", "maritalstatus",
    "occupation", "relationship", "race", "sex"
  ),
  function(predictor) plot(train_sal[[predictor]], train_sal$Salary)
))

# Box plots of the remaining numeric predictors for each salary class.
# Columns are named bare inside aes() so ggplot2 resolves them from `data`.

# Capital gain by salary class.
ggplot(data = train_sal, aes(x = Salary, y = capitalgain, fill = Salary)) +
  geom_boxplot() +
  ggtitle("Box Plot")

# Capital loss by salary class.
ggplot(data = train_sal, aes(x = Salary, y = capitalloss, fill = Salary)) +
  geom_boxplot() +
  ggtitle("Box Plot")

# Hours worked per week by salary class.
ggplot(data = train_sal, aes(x = Salary, y = hoursperweek, fill = Salary)) +
  geom_boxplot() +
  ggtitle("Box Plot")

# Plot native country against the salary class.
with(train_sal, plot(native, Salary))

#Density Plot 

# Density of age, filled by salary class. ggtitle() is chained with `+` so
# the title is attached to the plot rather than evaluated on its own.
ggplot(data = train_sal, aes(x = age, fill = Salary)) +
  geom_density(alpha = 0.9, color = 'Violet') +
  ggtitle("Age - Density Plot")
  # Density of workclass, filled by salary class. ggtitle() is chained with
  # `+` so the title is attached to the plot.
  # NOTE(review): workclass is a factor; a bar chart may be what is intended.
  ggplot(data = train_sal, aes(x = workclass, fill = Salary)) +
    geom_density(alpha = 0.9, color = 'Violet') +
    ggtitle("Workclass Density Plot")
  # Density of education, filled by salary class. ggtitle() is chained with
  # `+` so the title is attached to the plot.
  # NOTE(review): education is a factor; a bar chart may be what is intended.
  ggplot(data = train_sal, aes(x = education, fill = Salary)) +
    geom_density(alpha = 0.9, color = 'Violet') +
    ggtitle("education Density Plot")
  # Density of educationno, filled by salary class. ggtitle() is chained with
  # `+` so the title is attached to the plot.
  # NOTE(review): educationno was converted to a factor earlier; a bar chart
  # may be what is intended.
  ggplot(data = train_sal, aes(x = educationno, fill = Salary)) +
    geom_density(alpha = 0.9, color = 'Violet') +
    ggtitle("educationno Density Plot")
  # Density of maritalstatus, filled by salary class. ggtitle() is chained
  # with `+` so the title is attached to the plot.
  # NOTE(review): maritalstatus is a factor; a bar chart may be intended.
  ggplot(data = train_sal, aes(x = maritalstatus, fill = Salary)) +
    geom_density(alpha = 0.9, color = 'Violet') +
    ggtitle("maritalstatus Density Plot")
  # Density of occupation, filled by salary class. ggtitle() is chained with
  # `+` so the title is attached to the plot.
  # NOTE(review): occupation is a factor; a bar chart may be what is intended.
  ggplot(data = train_sal, aes(x = occupation, fill = Salary)) +
    geom_density(alpha = 0.9, color = 'Violet') +
    ggtitle("occupation Density Plot")
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
  # Density of sex, filled by salary class. ggtitle() is chained with `+` so
  # the title is attached to the plot.
  # NOTE(review): sex is a factor; a bar chart may be what is intended.
  ggplot(data = train_sal, aes(x = sex, fill = Salary)) +
    geom_density(alpha = 0.9, color = 'Violet') +
    ggtitle("sex Density Plot")
  # Density of relationship, filled by salary class. ggtitle() is chained
  # with `+` so the title is attached to the plot.
  # NOTE(review): relationship is a factor; a bar chart may be intended.
  ggplot(data = train_sal, aes(x = relationship, fill = Salary)) +
    geom_density(alpha = 0.9, color = 'Violet') +
    ggtitle("Relationship Density Plot")
  # Density of race, filled by salary class. ggtitle() is chained with `+` so
  # the title is attached to the plot.
  # NOTE(review): race is a factor; a bar chart may be what is intended.
  ggplot(data = train_sal, aes(x = race, fill = Salary)) +
    geom_density(alpha = 0.9, color = 'Violet') +
    ggtitle("Race Density Plot")
  # Density of capital gain, filled by salary class. ggtitle() is chained
  # with `+` so the title is attached to the plot.
  ggplot(data = train_sal, aes(x = capitalgain, fill = Salary)) +
    geom_density(alpha = 0.9, color = 'Violet') +
    ggtitle("Capitalgain Density Plot")
  # Density of capital loss, filled by salary class. ggtitle() is chained
  # with `+` so the title is attached to the plot.
  ggplot(data = train_sal, aes(x = capitalloss, fill = Salary)) +
    geom_density(alpha = 0.9, color = 'Violet') +
    ggtitle("Capitalloss Density Plot")
  # Density of hours worked per week, filled by salary class. ggtitle() is
  # chained with `+` so the title is attached to the plot.
  ggplot(data = train_sal, aes(x = hoursperweek, fill = Salary)) +
    geom_density(alpha = 0.9, color = 'Violet') +
    ggtitle("Hoursperweek Density Plot")
  # Density of native country, filled by salary class. ggtitle() is chained
  # with `+` so the title is attached to the plot.
  # NOTE(review): native is a factor; a bar chart may be what is intended.
  ggplot(data = train_sal, aes(x = native, fill = Salary)) +
    geom_density(alpha = 0.9, color = 'Violet') +
    ggtitle("native Density Plot")
## Warning: Groups with fewer than two data points have been dropped.
  # Naive Bayes Model 
  # Fit a Naive Bayes classifier of Salary on every other column. The
  # response is named bare so it is resolved from `data`, like the predictors,
  # rather than being pulled from the global data frame.
  Model <- naiveBayes(Salary ~ ., data = train_sal)
  # Print the fitted priors and the per-predictor conditional tables.
  Model
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##     <=50K      >50K 
## 0.7510693 0.2489307 
## 
## Conditional probabilities:
##         age
## Y            [,1]     [,2]
##    <=50K 36.60826 13.46489
##    >50K  43.95911 10.26963
## 
##         workclass
## Y         Federal-gov    Local-gov      Private  Self-emp-inc
##    <=50K 0.0255153843 0.0643623361 0.7685074825  0.0209243809
##    >50K  0.0486148109 0.0811134790 0.6494405967  0.0799147576
##         workclass
## Y         Self-emp-not-inc    State-gov  Without-pay
##    <=50K      0.0787975103 0.0412748863 0.0006180197
##    >50K       0.0950985615 0.0458177944 0.0000000000
## 
##         education
## Y                10th         11th         12th      1st-4th      5th-6th
##    <=50K 0.0335937845 0.0436586766 0.0153622037 0.0064009182 0.0121838167
##    >50K  0.0078582845 0.0078582845 0.0038625466 0.0007991476 0.0015982952
##         education
## Y             7th-8th          9th   Assoc-acdm    Assoc-voc    Bachelors
##    <=50K 0.0230433055 0.0189820333 0.0331964861 0.0425109257 0.1288129608
##    >50K  0.0046616942 0.0033297816 0.0340969632 0.0458177944 0.2831646244
##         education
## Y           Doctorate      HS-grad      Masters    Preschool  Prof-school
##    <=50K 0.0041937050 0.3629982784 0.0312982828 0.0019864919 0.0060036198
##    >50K  0.0372935535 0.2153702717 0.1222695791 0.0000000000 0.0540756526
##         education
## Y         Some-college
##    <=50K  0.2357745111
##    >50K   0.1779435269
## 
##         educationno
## Y                   1            2            3            4            5
##    <=50K 0.0019864919 0.0064009182 0.0121838167 0.0230433055 0.0189820333
##    >50K  0.0000000000 0.0007991476 0.0015982952 0.0046616942 0.0033297816
##         educationno
## Y                   6            7            8            9           10
##    <=50K 0.0335937845 0.0436586766 0.0153622037 0.3629982784 0.2357745111
##    >50K  0.0078582845 0.0078582845 0.0038625466 0.2153702717 0.1779435269
##         educationno
## Y                  11           12           13           14           15
##    <=50K 0.0425109257 0.0331964861 0.1288129608 0.0312982828 0.0060036198
##    >50K  0.0458177944 0.0340969632 0.2831646244 0.1222695791 0.0540756526
##         educationno
## Y                  16
##    <=50K 0.0041937050
##    >50K  0.0372935535
## 
##         maritalstatus
## Y            Divorced  Married-AF-spouse  Married-civ-spouse
##    <=50K 0.1660707191       0.0004855869        0.3384099236
##    >50K  0.0602024507       0.0013319126        0.8522908897
##         maritalstatus
## Y         Married-spouse-absent  Never-married    Separated      Widowed
##    <=50K           0.0149649053   0.4085551583 0.0385379420 0.0329757648
##    >50K            0.0041289291   0.0625998934 0.0087906233 0.0106553010
## 
##         occupation
## Y         Adm-clerical  Armed-Forces  Craft-repair  Exec-managerial
##    <=50K  0.1422769611  0.0003531541  0.1378183905     0.0907164614
##    >50K   0.0663292488  0.0001331913  0.1209376665     0.2579914758
##         occupation
## Y         Farming-fishing  Handlers-cleaners  Machine-op-inspct
##    <=50K     0.0385820863       0.0559307818       0.0759281331
##    >50K      0.0153169952       0.0110548748       0.0326318594
##         occupation
## Y         Other-service  Priv-house-serv  Prof-specialty  Protective-serv
##    <=50K   0.1359643314     0.0062684854    0.0983092747     0.0191586103
##    >50K    0.0175812467     0.0001331913    0.2412093767     0.0279701652
##         occupation
## Y               Sales  Tech-support  Transport-moving
##    <=50K 0.1153931047  0.0279874630      0.0553127621
##    >50K  0.1291955248  0.0370271710      0.0424880128
## 
##         relationship
## Y            Husband  Not-in-family  Other-relative   Own-child
##    <=50K 0.299474683    0.304727851     0.037655057 0.194323048
##    >50K  0.756393181    0.109616409     0.004661694 0.008524241
##         relationship
## Y          Unmarried        Wife
##    <=50K 0.132388646 0.031430716
##    >50K  0.028369739 0.092434736
## 
##         race
## Y         Amer-Indian-Eskimo  Asian-Pac-Islander       Black       Other
##    <=50K         0.011124354         0.028561338 0.108197590 0.009270295
##    >50K          0.004528503         0.033031433 0.048748002 0.002797017
##         race
## Y              White
##    <=50K 0.842846422
##    >50K  0.910895045
## 
##         sex
## Y           Female      Male
##    <=50K 0.3826866 0.6173134
##    >50K  0.1481087 0.8518913
## 
##         capitalgain
## Y             [,1]       [,2]
##    <=50K  148.9004   936.4124
##    >50K  3937.6798 14386.0600
## 
##         capitalloss
## Y             [,1]     [,2]
##    <=50K  53.35302 309.9476
##    >50K  193.75067 592.8256
## 
##         hoursperweek
## Y            [,1]     [,2]
##    <=50K 39.34856 11.95104
##    >50K  45.70658 10.73699
## 
##         native
## Y            Cambodia       Canada        China     Columbia         Cuba
##    <=50K 0.0004855869 0.0031342427 0.0021189246 0.0023837902 0.0029576657
##    >50K  0.0009323388 0.0047948855 0.0026638253 0.0002663825 0.0033297816
##         native
## Y         Dominican-Republic      Ecuador  El-Salvador      England
##    <=50K        0.0028693771 0.0010153181 0.0040171280 0.0024720788
##    >50K         0.0002663825 0.0005327651 0.0011987214 0.0039957379
##         native
## Y              France      Germany       Greece    Guatemala        Haiti
##    <=50K 0.0006621640 0.0037081181 0.0009270295 0.0026486558 0.0016774820
##    >50K  0.0015982952 0.0058604156 0.0010655301 0.0003995738 0.0005327651
##         native
## Y            Honduras         Hong      Hungary        India         Iran
##    <=50K 0.0004855869 0.0005738754 0.0004414426 0.0026486558 0.0010594623
##    >50K  0.0001331913 0.0007991476 0.0003995738 0.0053276505 0.0023974427
##         native
## Y             Ireland        Italy      Jamaica        Japan         Laos
##    <=50K 0.0008387410 0.0019423476 0.0030900984 0.0015891935 0.0006621640
##    >50K  0.0006659563 0.0031965903 0.0013319126 0.0030633990 0.0002663825
##         native
## Y              Mexico    Nicaragua  Outlying-US(Guam-USVI-etc)
##    <=50K 0.0254712400 0.0013684722                0.0006180197
##    >50K  0.0043953117 0.0002663825                0.0000000000
##         native
## Y                Peru  Philippines       Poland     Portugal  Puerto-Rico
##    <=50K 0.0012360394 0.0056504657 0.0019864919 0.0013243279 0.0042819936
##    >50K  0.0002663825 0.0079914758 0.0014651039 0.0005327651 0.0015982952
##         native
## Y            Scotland        South       Taiwan     Thailand
##    <=50K 0.0003972984 0.0025162230 0.0010153181 0.0006180197
##    >50K  0.0002663825 0.0018646777 0.0025306340 0.0003995738
##         native
## Y         Trinadad&Tobago  United-States      Vietnam   Yugoslavia
##    <=50K     0.0007063082   0.9053546992 0.0026045115 0.0004414426
##    >50K      0.0002663825   0.9316728823 0.0006659563 0.0007991476
# Predict the salary class of every test row with the fitted model.
Model_pred <- predict(object = Model, newdata = test_sal)
# Share of test rows whose predicted class equals the recorded class.
mean(test_sal$Salary == Model_pred)
## [1] 0.8187251
# Cross-tabulate predictions against the recorded classes with summary stats.
confusionMatrix(data = Model_pred, reference = test_sal$Salary)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  <=50K  >50K
##      <=50K  10549  1919
##      >50K     811  1781
##                                           
##                Accuracy : 0.8187          
##                  95% CI : (0.8125, 0.8248)
##     No Information Rate : 0.7543          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.456           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9286          
##             Specificity : 0.4814          
##          Pos Pred Value : 0.8461          
##          Neg Pred Value : 0.6871          
##              Prevalence : 0.7543          
##          Detection Rate : 0.7005          
##    Detection Prevalence : 0.8279          
##       Balanced Accuracy : 0.7050          
##                                           
##        'Positive' Class :  <=50K          
##