library(rpart) # For decision tree classifier
library(e1071) # For Naive Bayes classifier
m<-read.csv("C:/Users/pradeep/OneDrive/datasets/students_placement_data_ms.csv")
head(m)
## Roll.No Gender Section SSC.Percentage inter_Diploma_percentage
## 1 1 M A 87.30 65.3
## 2 2 F B 89.00 92.4
## 3 3 F A 67.00 68.0
## 4 4 M A 71.00 70.4
## 5 5 M A 67.00 65.5
## 6 6 M A 81.26 68.0
## B.Tech_percentage Backlogs registered_for_.Placement_Training
## 1 40.00 18 NO
## 2 71.45 0 yes
## 3 45.26 13 yes
## 4 36.47 17 yes
## 5 42.52 17 yes
## 6 62.20 6 yes
## placement.status
## 1 Not placed
## 2 Placed
## 3 Not placed
## 4 Not placed
## 5 Not placed
## 6 Not placed
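Before handling missing values, summary() gives a quick overview: for each numeric column it reports the range and, where present, the number of NA's. This check is optional and its exact output depends on the dataset.
# Optional: per-column summary, including NA counts for numeric columns
summary(m)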
Here, missing values (NA) are imputed using the attribute-mean technique.
print(anyNA(m$SSC.Percentage)) # Check for any NA in SSC.Percentage
## [1] TRUE
print(anyNA(m$inter_Diploma_percentage)) # Check for any NA in inter_Diploma_percentage
## [1] TRUE
print(anyNA(m$B.Tech_percentage)) # Check for any NA in B.Tech_percentage
## [1] FALSE
# Impute NA's in SSC percentage with the column mean
m$SSC.Percentage[is.na(m$SSC.Percentage)]<-mean(m$SSC.Percentage,na.rm = TRUE) # na.rm = TRUE excludes NA's when calculating the mean
anyNA(m$SSC.Percentage)
## [1] FALSE
# Impute NA's in intermediate/diploma percentage with the column mean
m$inter_Diploma_percentage[is.na(m$inter_Diploma_percentage)]<-mean(m$inter_Diploma_percentage,na.rm = TRUE) # na.rm = TRUE excludes NA's when calculating the mean
anyNA(m$inter_Diploma_percentage)
## [1] FALSE
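The same mean imputation can be wrapped in a small helper and applied to every numeric column in one pass. This is only a sketch equivalent to the two assignments above; the name impute_mean is chosen here and does not come from any package.
# Sketch of a reusable mean-imputation helper
impute_mean <- function(x) {
  x[is.na(x)] <- mean(x, na.rm = TRUE) # replace NA's with the column mean
  x
}
num_cols <- sapply(m, is.numeric)               # identify numeric columns
m[num_cols] <- lapply(m[num_cols], impute_mean) # impute each numeric column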
#Draw a histogram for SSC percentage
hist(m$SSC.Percentage)
# Draw a histogram for inter percentage
hist(m$inter_Diploma_percentage)
# Draw a histogram for B.Tech percentage
hist(m$B.Tech_percentage)
#Draw a bar chart for gender
barplot(table(m$Gender))
# Draw a bar chart for placement status
barplot(table(m$placement.status))
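If preferred, the three histograms and two bar charts can be displayed together in one window by setting a plot grid with par(); this is purely a display convenience and does not change any of the plots.
# Optional: arrange the five plots in a 2 x 3 grid
op <- par(mfrow = c(2, 3))
hist(m$SSC.Percentage)
hist(m$inter_Diploma_percentage)
hist(m$B.Tech_percentage)
barplot(table(m$Gender))
barplot(table(m$placement.status))
par(op) # restore the previous plotting layout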
n=nrow(m) # Number of rows in the dataset
print(n)
## [1] 117
set.seed(101)
# The intention is an 85/15 train/test split. Note that sample() is called with replace = TRUE,
# so some rows are drawn more than once for training and the test set (the rows never drawn)
# ends up larger than 15 percent of the data; a split without replacement is sketched below.
data_index=sample(1:n, size = round(0.85*n),replace = TRUE)
train_data=m[data_index,]
test_data=m[-data_index,]
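For reference, a conventional 85/15 split draws the training indices without replacement, so no row is repeated and the test set holds the remaining 15 percent. The sketch below is not the split used for the results that follow; with it, the row counts and accuracies would differ.
# Sketch of an 85/15 split without replacement (not used for the results below)
set.seed(101)
train_index <- sample(1:n, size = round(0.85*n), replace = FALSE)
train_alt <- m[train_index, ]  # about 85 percent of the rows, no duplicates
test_alt  <- m[-train_index, ] # the remaining rows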
# Now, let's check the structure of train_data and test_data.
str(train_data)
## 'data.frame': 99 obs. of 9 variables:
## $ Roll.No : int 44 6 84 77 30 36 69 40 73 64 ...
## $ Gender : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 1 2 ...
## $ Section : Factor w/ 2 levels "A","B": 2 1 1 1 2 2 1 2 2 1 ...
## $ SSC.Percentage : num 86 81.3 89 78 72 ...
## $ inter_Diploma_percentage : num 92.5 68 88.9 59 88.1 90 61 88.8 83.7 69.2 ...
## $ B.Tech_percentage : num 70.8 62.2 63 51.1 69.6 ...
## $ Backlogs : int 0 6 1 17 0 0 6 0 0 20 ...
## $ registered_for_.Placement_Training: Factor w/ 2 levels "NO","yes": 2 2 1 1 2 2 1 2 2 2 ...
## $ placement.status : Factor w/ 2 levels "Not placed","Placed": 1 1 1 1 2 2 1 1 1 1 ...
str(test_data)
## 'data.frame': 49 obs. of 9 variables:
## $ Roll.No : int 1 4 7 8 11 12 14 15 17 18 ...
## $ Gender : Factor w/ 2 levels "F","M": 2 2 2 1 1 2 1 2 1 1 ...
## $ Section : Factor w/ 2 levels "A","B": 1 1 1 1 2 1 2 1 2 2 ...
## $ SSC.Percentage : num 87.3 71 71 84.8 82.3 ...
## $ inter_Diploma_percentage : num 65.3 70.4 56.5 79.3 76.3 66 88.7 52.2 85 95.1 ...
## $ B.Tech_percentage : num 40 36.5 33.8 61 71.5 ...
## $ Backlogs : int 18 17 20 3 0 16 0 7 0 0 ...
## $ registered_for_.Placement_Training: Factor w/ 2 levels "NO","yes": 1 2 2 1 1 2 2 2 2 2 ...
## $ placement.status : Factor w/ 2 levels "Not placed","Placed": 1 1 1 1 2 1 2 1 1 2 ...
# placement.status is the class label
stu_model<-rpart(formula = placement.status ~ Backlogs + Gender + B.Tech_percentage + SSC.Percentage + inter_Diploma_percentage,
                 data = train_data, method = "class", parms = list(split = "gini"))
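Before predicting, the fitted tree can be inspected. print() works with rpart itself; the graphical view assumes the separate rpart.plot package is installed.
# Optional: inspect the fitted tree
print(stu_model)      # text listing of the splits
library(rpart.plot)   # assumed to be installed
rpart.plot(stu_model) # graphical view of the tree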
p<-predict(stu_model,test_data,type="class")
print(p)
## 1 4 7 8 11 12
## Not placed Not placed Not placed Not placed Not placed Not placed
## 14 15 17 18 19 21
## Placed Not placed Placed Placed Not placed Placed
## 23 26 31 32 34 35
## Not placed Not placed Placed Not placed Not placed Not placed
## 37 41 42 43 45 55
## Not placed Placed Not placed Not placed Not placed Not placed
## 56 57 58 59 60 63
## Placed Not placed Not placed Placed Placed Not placed
## 65 66 68 71 74 75
## Placed Placed Placed Placed Not placed Not placed
## 76 85 87 88 89 93
## Not placed Not placed Not placed Not placed Not placed Not placed
## 100 101 102 105 106 114
## Not placed Not placed Not placed Not placed Not placed Not placed
## 116
## Placed
## Levels: Not placed Placed
t<-table(test_data[,9],p) # confusion matrix: actual placement.status (rows) vs predicted (columns)
print(t)
## p
## Not placed Placed
## Not placed 29 2
## Placed 6 12
# Accuracy of decision tree
print(sum(diag(t))/sum(t))
## [1] 0.8367347
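Accuracy can be supplemented with per-class measures computed from the same confusion matrix t, where rows are the actual labels and columns the predictions. The sketch below derives precision, recall and F1 for the "Placed" class.
# Precision, recall and F1 for the "Placed" class, from the confusion matrix t
tp <- t["Placed", "Placed"]     # correctly predicted Placed
fp <- t["Not placed", "Placed"] # predicted Placed but actually Not placed
fn <- t["Placed", "Not placed"] # predicted Not placed but actually Placed
precision <- tp / (tp + fp)
recall <- tp / (tp + fn)
f1 <- 2 * precision * recall / (precision + recall)
print(round(c(precision = precision, recall = recall, F1 = f1), 4))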
# placement.status is the class label
stu_model1<-naiveBayes(placement.status ~ Backlogs + Gender + B.Tech_percentage + SSC.Percentage + inter_Diploma_percentage,
                       data = train_data)
# Now let's do prediction.
q<-predict(stu_model1,test_data,type="class")
q
## [1] Not placed Not placed Not placed Not placed Placed Not placed
## [7] Placed Not placed Placed Placed Placed Placed
## [13] Placed Not placed Placed Not placed Not placed Placed
## [19] Placed Placed Not placed Not placed Placed Not placed
## [25] Placed Not placed Not placed Placed Placed Not placed
## [31] Placed Placed Placed Placed Not placed Placed
## [37] Not placed Placed Not placed Not placed Not placed Placed
## [43] Placed Not placed Placed Not placed Not placed Placed
## [49] Placed
## Levels: Not placed Placed
t1<-table(test_data[,9],q) # confusion matrix: actual placement.status (rows) vs predicted (columns)
print(t1)
## q
## Not placed Placed
## Not placed 22 9
## Placed 1 17
print(sum(diag(t1))/sum(t1))
## [1] 0.7959184
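Finally, the two accuracies computed above can be collected into one small table for a side-by-side comparison of the classifiers.
# Side-by-side accuracy comparison (both values were computed above)
comparison <- data.frame(model = c("Decision tree", "Naive Bayes"),
                         accuracy = c(sum(diag(t))/sum(t), sum(diag(t1))/sum(t1)))
print(comparison)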