# TheoryDa.R

# Load the ionosphere data set and give its 35 columns short names.
#
# The UCI ionosphere.data file has no header row (the column names are
# assigned by hand just below), so it must be read with header = FALSE.
# With header = TRUE the first observation was silently consumed as column
# names, which is why the recorded str() output further down reports 350
# observations. NOTE: the recorded outputs in this file predate this fix;
# re-running will show one additional row.
df <- read.csv("G:/RStudioWork/ionosphere.data", sep = ",", header = FALSE)

# View() opens a data viewer window; only do that in an interactive session
# so the script can also be run non-interactively.
if (interactive()) {
  View(df)
}

# 34 attributes named a01..a34 followed by the class label.
colnames(df) <- c(sprintf("a%02d", 1:34), "class")

if (interactive()) {
  View(df)
}

# Number of missing values in each column.
colSums(is.na(df))

##   a01   a02   a03   a04   a05   a06   a07   a08   a09   a10   a11 
##     0     0     0     0     0     0     0     0     0     0     0 
##   a12   a13   a14   a15   a16   a17   a18   a19   a20   a21   a22 
##     0     0     0     0     0     0     0     0     0     0     0 
##   a23   a24   a25   a26   a27   a28   a29   a30   a31   a32   a33 
##     0     0     0     0     0     0     0     0     0     0     0 
##   a34 class 
##     0     0

# Recode the class label as a two-level factor with levels "1" and "2":
# "b" maps to 1 and "g" maps to 2; any other value becomes NA.
df$class <- factor(match(df$class, c("b", "g")), levels = c(1, 2))

# Show the resulting column types.
str(df)

## 'data.frame':    350 obs. of  35 variables:
##  $ a01  : int  1 1 1 1 1 1 0 1 1 1 ...
##  $ a02  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ a03  : num  1 1 1 1 0.0234 ...
##  $ a04  : num  -0.18829 -0.03365 -0.45161 -0.02401 -0.00592 ...
##  $ a05  : num  0.9304 1 1 0.9414 -0.0992 ...
##  $ a06  : num  -0.36156 0.00485 1 0.06531 -0.11949 ...
##  $ a07  : num  -0.10868 1 0.71216 0.92106 -0.00763 ...
##  $ a08  : num  -0.936 -0.121 -1 -0.233 -0.118 ...
##  $ a09  : num  1 0.89 0 0.772 0.147 ...
##  $ a10  : num  -0.0455 0.012 0 -0.164 0.0664 ...
##  $ a11  : num  0.5087 0.7308 0 0.528 0.0379 ...
##  $ a12  : num  -0.6774 0.0535 0 -0.2028 -0.063 ...
##  $ a13  : num  0.344 0.854 0 0.564 0 ...
##  $ a14  : num  -0.69707 0.00827 0 -0.00712 0 ...
##  $ a15  : num  -0.5169 0.5459 -1 0.3439 -0.0457 ...
##  $ a16  : num  -0.97515 0.00299 0.14516 -0.27457 -0.1554 ...
##  $ a17  : num  0.05499 0.83775 0.54094 0.5294 -0.00343 ...
##  $ a18  : num  -0.622 -0.136 -0.393 -0.218 -0.102 ...
##  $ a19  : num  0.331 0.755 -1 0.451 -0.116 ...
##  $ a20  : num  -1 -0.0854 -0.5447 -0.1781 -0.0541 ...
##  $ a21  : num  -0.1315 0.7089 -0.6997 0.0598 0.0184 ...
##  $ a22  : num  -0.453 -0.275 1 -0.3558 0.0367 ...
##  $ a23  : num  -0.1806 0.4339 0 0.0231 0.0152 ...
##  $ a24  : num  -0.35734 -0.12062 0 -0.52879 0.00888 ...
##  $ a25  : num  -0.2033 0.5753 1 0.0329 0.0351 ...
##  $ a26  : num  -0.2657 -0.4022 0.907 -0.6516 -0.0154 ...
##  $ a27  : num  -0.2047 0.5898 0.5161 0.1329 -0.0324 ...
##  $ a28  : num  -0.184 -0.2215 1 -0.5321 0.0922 ...
##  $ a29  : num  -0.1904 0.431 1 0.0243 -0.0786 ...
##  $ a30  : num  -0.11593 -0.17365 -0.20099 -0.62197 0.00732 ...
##  $ a31  : num  -0.1663 0.6044 0.2568 -0.0571 0 ...
##  $ a32  : num  -0.0629 -0.2418 1 -0.5957 0 ...
##  $ a33  : num  -0.13738 0.56045 -0.32382 -0.04608 -0.00039 ...
##  $ a34  : num  -0.0245 -0.3824 1 -0.657 0.1201 ...
##  $ class: Factor w/ 2 levels "1","2": 1 2 1 2 1 2 1 2 1 2 ...

# Exploratory plots of the full data frame (features a01..a34 plus class).

# Open the data frame in the data viewer.
View(df)
# One box per column of df.
# NOTE(review): df still contains the factor column `class` at this point, so
# it is passed to boxplot() together with the features -- confirm intended.
boxplot(df,xlab="Features",ylab="Frequency",main="Ionosphere dataset 20MID0071")

# Histograms of three individual features.
hist(df$a24, main="20MID0071")

hist(df$a12, main="20MID0071")

hist(df$a30, main="20MID0071")

# Scatter plots between pairs of features.
plot(x=df$a24,y=df$a20, main = "Scatter between features 20MID0071")

plot(x=df$a30,y=df$a31, main = "Scatter between features 2 20MID0071")

# Feature a24 against the class label (class is a factor with levels 1 and 2).
plot(x=df$a24,y=df$class, main = "Scatter between dependant and indepandant 20MID0071")

# Feature-only copy of the data: every column except the class label.
df2 <- df[, names(df) != "class", drop = FALSE]

# Show the structure of the feature-only frame.
str(df2)

## 'data.frame':    350 obs. of  34 variables:
##  $ a01: int  1 1 1 1 1 1 0 1 1 1 ...
##  $ a02: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ a03: num  1 1 1 1 0.0234 ...
##  $ a04: num  -0.18829 -0.03365 -0.45161 -0.02401 -0.00592 ...
##  $ a05: num  0.9304 1 1 0.9414 -0.0992 ...
##  $ a06: num  -0.36156 0.00485 1 0.06531 -0.11949 ...
##  $ a07: num  -0.10868 1 0.71216 0.92106 -0.00763 ...
##  $ a08: num  -0.936 -0.121 -1 -0.233 -0.118 ...
##  $ a09: num  1 0.89 0 0.772 0.147 ...
##  $ a10: num  -0.0455 0.012 0 -0.164 0.0664 ...
##  $ a11: num  0.5087 0.7308 0 0.528 0.0379 ...
##  $ a12: num  -0.6774 0.0535 0 -0.2028 -0.063 ...
##  $ a13: num  0.344 0.854 0 0.564 0 ...
##  $ a14: num  -0.69707 0.00827 0 -0.00712 0 ...
##  $ a15: num  -0.5169 0.5459 -1 0.3439 -0.0457 ...
##  $ a16: num  -0.97515 0.00299 0.14516 -0.27457 -0.1554 ...
##  $ a17: num  0.05499 0.83775 0.54094 0.5294 -0.00343 ...
##  $ a18: num  -0.622 -0.136 -0.393 -0.218 -0.102 ...
##  $ a19: num  0.331 0.755 -1 0.451 -0.116 ...
##  $ a20: num  -1 -0.0854 -0.5447 -0.1781 -0.0541 ...
##  $ a21: num  -0.1315 0.7089 -0.6997 0.0598 0.0184 ...
##  $ a22: num  -0.453 -0.275 1 -0.3558 0.0367 ...
##  $ a23: num  -0.1806 0.4339 0 0.0231 0.0152 ...
##  $ a24: num  -0.35734 -0.12062 0 -0.52879 0.00888 ...
##  $ a25: num  -0.2033 0.5753 1 0.0329 0.0351 ...
##  $ a26: num  -0.2657 -0.4022 0.907 -0.6516 -0.0154 ...
##  $ a27: num  -0.2047 0.5898 0.5161 0.1329 -0.0324 ...
##  $ a28: num  -0.184 -0.2215 1 -0.5321 0.0922 ...
##  $ a29: num  -0.1904 0.431 1 0.0243 -0.0786 ...
##  $ a30: num  -0.11593 -0.17365 -0.20099 -0.62197 0.00732 ...
##  $ a31: num  -0.1663 0.6044 0.2568 -0.0571 0 ...
##  $ a32: num  -0.0629 -0.2418 1 -0.5957 0 ...
##  $ a33: num  -0.13738 0.56045 -0.32382 -0.04608 -0.00039 ...
##  $ a34: num  -0.0245 -0.3824 1 -0.657 0.1201 ...

# Convert the feature-only data frame to a numeric matrix.
# matrix() applied to a data frame yields a one-column list-matrix holding
# the columns as list elements; as.matrix() gives the intended
# observations-by-features numeric matrix.
df2 <- as.matrix(df2)

library(ggplot2)

# Scatter of a24 against a32, with point colour and size mapped to class.
# Three colours are supplied to the manual scale; class has two levels, so
# only the first two are used.
ggplot(df, aes(x = a24, y = a32)) +
  geom_point(aes(color = class, size = class)) +
  scale_color_manual(values = c("#00AFBB", "#E7B800", "#FC4E07"))

## Warning: Using size for a discrete variable is not advised.

# Bar plots of three individual features (one bar per observation).
barplot(
  height = df$a03,
  xlab = "Df$a03",
  ylab = "Range",
  main = "BarPlot of Df$a03 20MID0071"
)

barplot(
  height = df$a24,
  xlab = "Df$a24",
  ylab = "Range",
  main = "BarPlot of Df$a24 20MID0071"
)

barplot(
  height = df$a30,
  xlab = "Df$a30",
  ylab = "Range",
  main = "BarPlot of Df$a30 20MID0071"
)

# Points joined by lines (type "o"), drawn in row order: a24 against a28.
plot(
  x = df$a24,
  y = df$a28,
  type = "o",
  xlab = "df$a24",
  main = "20MID0071"
)

# Same style of plot for a24 against the class label.
plot(
  x = df$a24,
  y = df$class,
  type = "o",
  xlab = "df$24",
  ylab = "df$class",
  main = "20MID0071"
)

# Model 1
# Keep only columns 23 to 35 for modelling: features a23..a34 and the class
# label, which becomes column 13 of the reduced frame.
df <- df[, 23:35]

# Show the structure of the reduced frame.
str(df)

## 'data.frame':    350 obs. of  13 variables:
##  $ a23  : num  -0.1806 0.4339 0 0.0231 0.0152 ...
##  $ a24  : num  -0.35734 -0.12062 0 -0.52879 0.00888 ...
##  $ a25  : num  -0.2033 0.5753 1 0.0329 0.0351 ...
##  $ a26  : num  -0.2657 -0.4022 0.907 -0.6516 -0.0154 ...
##  $ a27  : num  -0.2047 0.5898 0.5161 0.1329 -0.0324 ...
##  $ a28  : num  -0.184 -0.2215 1 -0.5321 0.0922 ...
##  $ a29  : num  -0.1904 0.431 1 0.0243 -0.0786 ...
##  $ a30  : num  -0.11593 -0.17365 -0.20099 -0.62197 0.00732 ...
##  $ a31  : num  -0.1663 0.6044 0.2568 -0.0571 0 ...
##  $ a32  : num  -0.0629 -0.2418 1 -0.5957 0 ...
##  $ a33  : num  -0.13738 0.56045 -0.32382 -0.04608 -0.00039 ...
##  $ a34  : num  -0.0245 -0.3824 1 -0.657 0.1201 ...
##  $ class: Factor w/ 2 levels "1","2": 1 2 1 2 1 2 1 2 1 2 ...

library(caTools)
library(class)

# 80/20 train/test split on the class label; the seed makes it reproducible.
set.seed(45)
split <- sample.split(Y = df$class, SplitRatio = 0.8)

# `split` is a logical vector (TRUE = training row). The previous
# subset(df, split = TRUE) passed `split` as an unused named argument rather
# than as a row filter, so train_set and test_set both held every row.
train_set <- df[split, ]
test_set <- df[!split, ]

# Standardise the 12 feature columns (column 13 is the class label).
# The test set is scaled with the training set's centre and scale so that no
# information from the test rows leaks into the preprocessing.
train_scaled <- scale(train_set[, -13])
train_set[, -13] <- train_scaled
test_set[, -13] <- scale(
  test_set[, -13],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
)

# k-nearest-neighbour classification with k = 5.
pred <- knn(
  train = train_set[, -13],
  test = test_set[, -13],
  cl = train_set[, 13],
  k = 5
)

# Confusion matrix: rows are the actual class, columns the predicted class.
cm <- table(test_set[, 13], pred)
cm

# NOTE: the outputs recorded below (and the glm summary / confusion matrix
# further down) come from the run made before the split fix, in which the
# train and test sets were both the full data set (this matrix sums to all
# 350 rows). Re-run the script to refresh them.
##    pred
##       1   2
##   1  88  38
##   2   2 222

# Accuracy: correctly classified rows over all test rows.
score <- sum(diag(cm)) / sum(cm)
score

## [1] 0.8857143

# Model 2 Logistic Regression
# Fit a binomial GLM of class on all remaining columns of train_set.
# With a factor response, glm() treats the first level ("1") as failure, so
# the model estimates the probability of class "2".
classifier <- glm(
  formula = class ~ .,
  family = binomial(),
  data = train_set
)

# Coefficient table and deviance summary.
summary(classifier)

## 
## Call:
## glm(formula = class ~ ., family = binomial(), data = train_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.0158  -0.6204   0.5886   0.7930   1.5734  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.570578   0.132875   4.294 1.75e-05 ***
## a23          0.511844   0.211063   2.425  0.01531 *  
## a24          0.099283   0.176227   0.563  0.57317    
## a25          0.128464   0.200476   0.641  0.52166    
## a26          0.003728   0.169890   0.022  0.98249    
## a27         -1.550288   0.256610  -6.041 1.53e-09 ***
## a28          0.051294   0.218768   0.234  0.81462    
## a29          0.664754   0.240499   2.764  0.00571 ** 
## a30          0.408897   0.179141   2.283  0.02246 *  
## a31          0.461484   0.225299   2.048  0.04053 *  
## a32          0.083709   0.205502   0.407  0.68376    
## a33          0.536791   0.221951   2.419  0.01558 *  
## a34         -0.345187   0.177437  -1.945  0.05173 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 457.39  on 349  degrees of freedom
## Residual deviance: 358.05  on 337  degrees of freedom
## AIC: 384.05
## 
## Number of Fisher Scoring iterations: 5

# Predicted probability of the second class level for every test row.
y_prob <- predict(
  object = classifier,
  type = "response",
  newdata = test_set
)

# Turn probabilities into class labels using the same coding as the actual
# labels ("1" / "2"). glm() models the probability of the second level, so a
# probability above 0.5 means class "2". Using a factor with both levels
# keeps the confusion matrix 2 x 2 even if only one class is ever predicted,
# which the diag()-based accuracy below relies on.
class_levels <- levels(test_set[, 13])
y_pred <- factor(
  ifelse(y_prob > 0.5, class_levels[2], class_levels[1]),
  levels = class_levels
)

# Confusion matrix: rows are the actual class, columns the predicted class.
cm <- table(test_set[, 13], y_pred)
cm

# NOTE: output recorded from the original run, where predictions were coded
# 0/1; those columns correspond to classes 1/2 respectively.
##    y_pred
##       0   1
##   1  65  61
##   2   8 216

# Accuracy: correctly classified rows over all test rows.
sum(diag(cm)) / sum(cm)

## [1] 0.8028571

# TheoryDa.R
#
# Shantanu Dahitule
#
# 2022-11-09