# Load the Ionosphere data set and give its 35 columns short names.
#
# The file is read with header = FALSE: the column names are assigned by hand
# right below, and reading with header = TRUE used the first observation as a
# header row and silently dropped it from the data.
# NOTE: output transcripts pasted further down this script (for example the
# "350 obs." str() listings) were recorded with that first row missing.
df <- read.csv("G:/RStudioWork/ionosphere.data", sep = ",", header = FALSE)
View(df)

# a01 .. a34 are the feature columns, the last column is the class label.
colnames(df) <- c(sprintf("a%02d", 1:34), "class")
View(df)

# Count of missing values in each column.
colSums(is.na(df))
## a01 a02 a03 a04 a05 a06 a07 a08 a09 a10 a11
## 0 0 0 0 0 0 0 0 0 0 0
## a12 a13 a14 a15 a16 a17 a18 a19 a20 a21 a22
## 0 0 0 0 0 0 0 0 0 0 0
## a23 a24 a25 a26 a27 a28 a29 a30 a31 a32 a33
## 0 0 0 0 0 0 0 0 0 0 0
## a34 class
## 0 0
# Recode the class label in one step: "b" becomes level "1" and "g" becomes
# level "2". Any other value turns into NA.
df$class <- factor(df$class, levels = c("b", "g"), labels = c("1", "2"))

# Show the column types after the recode.
str(df)
## 'data.frame': 350 obs. of 35 variables:
## $ a01 : int 1 1 1 1 1 1 0 1 1 1 ...
## $ a02 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ a03 : num 1 1 1 1 0.0234 ...
## $ a04 : num -0.18829 -0.03365 -0.45161 -0.02401 -0.00592 ...
## $ a05 : num 0.9304 1 1 0.9414 -0.0992 ...
## $ a06 : num -0.36156 0.00485 1 0.06531 -0.11949 ...
## $ a07 : num -0.10868 1 0.71216 0.92106 -0.00763 ...
## $ a08 : num -0.936 -0.121 -1 -0.233 -0.118 ...
## $ a09 : num 1 0.89 0 0.772 0.147 ...
## $ a10 : num -0.0455 0.012 0 -0.164 0.0664 ...
## $ a11 : num 0.5087 0.7308 0 0.528 0.0379 ...
## $ a12 : num -0.6774 0.0535 0 -0.2028 -0.063 ...
## $ a13 : num 0.344 0.854 0 0.564 0 ...
## $ a14 : num -0.69707 0.00827 0 -0.00712 0 ...
## $ a15 : num -0.5169 0.5459 -1 0.3439 -0.0457 ...
## $ a16 : num -0.97515 0.00299 0.14516 -0.27457 -0.1554 ...
## $ a17 : num 0.05499 0.83775 0.54094 0.5294 -0.00343 ...
## $ a18 : num -0.622 -0.136 -0.393 -0.218 -0.102 ...
## $ a19 : num 0.331 0.755 -1 0.451 -0.116 ...
## $ a20 : num -1 -0.0854 -0.5447 -0.1781 -0.0541 ...
## $ a21 : num -0.1315 0.7089 -0.6997 0.0598 0.0184 ...
## $ a22 : num -0.453 -0.275 1 -0.3558 0.0367 ...
## $ a23 : num -0.1806 0.4339 0 0.0231 0.0152 ...
## $ a24 : num -0.35734 -0.12062 0 -0.52879 0.00888 ...
## $ a25 : num -0.2033 0.5753 1 0.0329 0.0351 ...
## $ a26 : num -0.2657 -0.4022 0.907 -0.6516 -0.0154 ...
## $ a27 : num -0.2047 0.5898 0.5161 0.1329 -0.0324 ...
## $ a28 : num -0.184 -0.2215 1 -0.5321 0.0922 ...
## $ a29 : num -0.1904 0.431 1 0.0243 -0.0786 ...
## $ a30 : num -0.11593 -0.17365 -0.20099 -0.62197 0.00732 ...
## $ a31 : num -0.1663 0.6044 0.2568 -0.0571 0 ...
## $ a32 : num -0.0629 -0.2418 1 -0.5957 0 ...
## $ a33 : num -0.13738 0.56045 -0.32382 -0.04608 -0.00039 ...
## $ a34 : num -0.0245 -0.3824 1 -0.657 0.1201 ...
## $ class: Factor w/ 2 levels "1","2": 1 2 1 2 1 2 1 2 1 2 ...
# Exploratory base-graphics plots on the full data frame ----
View(df)

# Box plot of every column of df on one shared axis.
boxplot(
  df,
  xlab = "Features",
  ylab = "Frequency",
  main = "Ionosphere dataset 20MID0071"
)

# Histograms of three individual features.
hist(df$a24, main = "20MID0071")
hist(df$a12, main = "20MID0071")
hist(df$a30, main = "20MID0071")

# Feature-versus-feature scatter plots.
plot(
  x = df$a24,
  y = df$a20,
  main = "Scatter between features 20MID0071"
)
plot(
  x = df$a30,
  y = df$a31,
  main = "Scatter between features 2 20MID0071"
)

# Feature a24 against the class label.
plot(
  x = df$a24,
  y = df$class,
  main = "Scatter between dependant and indepandant 20MID0071"
)

# Copy of the data without column 35, which holds the class label.
df2 <- df[, -35]
str(df2)
## 'data.frame': 350 obs. of 34 variables:
## $ a01: int 1 1 1 1 1 1 0 1 1 1 ...
## $ a02: int 0 0 0 0 0 0 0 0 0 0 ...
## $ a03: num 1 1 1 1 0.0234 ...
## $ a04: num -0.18829 -0.03365 -0.45161 -0.02401 -0.00592 ...
## $ a05: num 0.9304 1 1 0.9414 -0.0992 ...
## $ a06: num -0.36156 0.00485 1 0.06531 -0.11949 ...
## $ a07: num -0.10868 1 0.71216 0.92106 -0.00763 ...
## $ a08: num -0.936 -0.121 -1 -0.233 -0.118 ...
## $ a09: num 1 0.89 0 0.772 0.147 ...
## $ a10: num -0.0455 0.012 0 -0.164 0.0664 ...
## $ a11: num 0.5087 0.7308 0 0.528 0.0379 ...
## $ a12: num -0.6774 0.0535 0 -0.2028 -0.063 ...
## $ a13: num 0.344 0.854 0 0.564 0 ...
## $ a14: num -0.69707 0.00827 0 -0.00712 0 ...
## $ a15: num -0.5169 0.5459 -1 0.3439 -0.0457 ...
## $ a16: num -0.97515 0.00299 0.14516 -0.27457 -0.1554 ...
## $ a17: num 0.05499 0.83775 0.54094 0.5294 -0.00343 ...
## $ a18: num -0.622 -0.136 -0.393 -0.218 -0.102 ...
## $ a19: num 0.331 0.755 -1 0.451 -0.116 ...
## $ a20: num -1 -0.0854 -0.5447 -0.1781 -0.0541 ...
## $ a21: num -0.1315 0.7089 -0.6997 0.0598 0.0184 ...
## $ a22: num -0.453 -0.275 1 -0.3558 0.0367 ...
## $ a23: num -0.1806 0.4339 0 0.0231 0.0152 ...
## $ a24: num -0.35734 -0.12062 0 -0.52879 0.00888 ...
## $ a25: num -0.2033 0.5753 1 0.0329 0.0351 ...
## $ a26: num -0.2657 -0.4022 0.907 -0.6516 -0.0154 ...
## $ a27: num -0.2047 0.5898 0.5161 0.1329 -0.0324 ...
## $ a28: num -0.184 -0.2215 1 -0.5321 0.0922 ...
## $ a29: num -0.1904 0.431 1 0.0243 -0.0786 ...
## $ a30: num -0.11593 -0.17365 -0.20099 -0.62197 0.00732 ...
## $ a31: num -0.1663 0.6044 0.2568 -0.0571 0 ...
## $ a32: num -0.0629 -0.2418 1 -0.5957 0 ...
## $ a33: num -0.13738 0.56045 -0.32382 -0.04608 -0.00039 ...
## $ a34: num -0.0245 -0.3824 1 -0.657 0.1201 ...
# Convert the feature data frame to a numeric matrix with the same rows and
# columns. as.matrix() is required here: matrix() applied to a data frame
# returns a one-column list matrix whose cells are entire columns.
df2 <- as.matrix(df2)
library(ggplot2)

# Scatter plot of a24 against a32. Point colour and point size are both mapped
# to the class factor; the colours come from a manual palette (three values
# are supplied, the class factor has two levels).
ggplot(data = df, mapping = aes(x = a24, y = a32)) +
  geom_point(mapping = aes(color = class, size = class)) +
  scale_color_manual(values = c("#00AFBB", "#E7B800", "#FC4E07"))
## Warning: Using size for a discrete variable is not advised.

# Bar plots: one bar per row for each of three features.
barplot(
  df$a03,
  xlab = "Df$a03",
  ylab = "Range",
  main = "BarPlot of Df$a03 20MID0071"
)

barplot(
  df$a24,
  xlab = "Df$a24",
  ylab = "Range",
  main = "BarPlot of Df$a24 20MID0071"
)

barplot(
  df$a30,
  xlab = "Df$a30",
  ylab = "Range",
  main = "BarPlot of Df$a30 20MID0071"
)

# Points joined by lines (type "o"), drawn in row order: a24 against a28 ...
plot(
  x = df$a24,
  y = df$a28,
  type = "o",
  xlab = "df$a24",
  main = "20MID0071"
)

# ... and a24 against the class label.
plot(
  x = df$a24,
  y = df$class,
  type = "o",
  xlab = "df$24",
  ylab = "df$class",
  main = "20MID0071"
)

# Model 1
# Restrict the data frame to columns 23 through 35 for modelling. This
# overwrites df, so the remaining columns are no longer available afterwards.
df <- df[, 23:35]
str(df)
## 'data.frame': 350 obs. of 13 variables:
## $ a23 : num -0.1806 0.4339 0 0.0231 0.0152 ...
## $ a24 : num -0.35734 -0.12062 0 -0.52879 0.00888 ...
## $ a25 : num -0.2033 0.5753 1 0.0329 0.0351 ...
## $ a26 : num -0.2657 -0.4022 0.907 -0.6516 -0.0154 ...
## $ a27 : num -0.2047 0.5898 0.5161 0.1329 -0.0324 ...
## $ a28 : num -0.184 -0.2215 1 -0.5321 0.0922 ...
## $ a29 : num -0.1904 0.431 1 0.0243 -0.0786 ...
## $ a30 : num -0.11593 -0.17365 -0.20099 -0.62197 0.00732 ...
## $ a31 : num -0.1663 0.6044 0.2568 -0.0571 0 ...
## $ a32 : num -0.0629 -0.2418 1 -0.5957 0 ...
## $ a33 : num -0.13738 0.56045 -0.32382 -0.04608 -0.00039 ...
## $ a34 : num -0.0245 -0.3824 1 -0.657 0.1201 ...
## $ class: Factor w/ 2 levels "1","2": 1 2 1 2 1 2 1 2 1 2 ...
# k-nearest-neighbours classifier (k = 5) evaluated on a held-out test set.
library(caTools)
set.seed(45)

# Logical vector, one entry per row: TRUE = training row, FALSE = test row.
split <- sample.split(Y = df$class, SplitRatio = 0.8)

# Index with the logical vector so the two sets are disjoint.
# The earlier form, subset(df, split = TRUE) / subset(df, split = FALSE),
# passed `split` as an unused named argument; no row filter was applied and
# both sets were the complete data frame.
train_set <- df[split, , drop = FALSE]
test_set <- df[!split, , drop = FALSE]

# Standardise the feature columns (column 13 is the class label). The test
# set is transformed with the training means and standard deviations so that
# both sets are on the same scale and no test information is used for fitting.
train_scaled <- scale(train_set[, -13])
train_set[, -13] <- train_scaled
test_set[, -13] <- scale(
  test_set[, -13],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
)

library(class)
pred <- knn(
  train = train_set[, -13],
  test = test_set[, -13],
  cl = train_set[, 13],
  k = 5
)

# Confusion matrix: rows are the actual classes, columns the predictions.
cm <- table(test_set[, 13], pred)
cm

# Accuracy = correctly classified rows / all test rows.
# NOTE: the transcript that used to be pasted here (350 evaluated rows,
# accuracy 0.8857143) was produced while the train and test sets were the same
# rows, so it is not reproduced; re-run the script for held-out figures.
# Output pasted later in this script that depends on train_set or test_set was
# recorded with the old split as well.
score <- sum(diag(cm)) / sum(cm)
score
# Model 2 Logistic Regression
# Binomial GLM of the class label on every other column of train_set.
classifier <- glm(
  formula = class ~ .,
  family = binomial(),
  data = train_set
)

# Coefficient table and deviance summary of the fitted model.
summary(classifier)
##
## Call:
## glm(formula = class ~ ., family = binomial(), data = train_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0158 -0.6204 0.5886 0.7930 1.5734
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.570578 0.132875 4.294 1.75e-05 ***
## a23 0.511844 0.211063 2.425 0.01531 *
## a24 0.099283 0.176227 0.563 0.57317
## a25 0.128464 0.200476 0.641 0.52166
## a26 0.003728 0.169890 0.022 0.98249
## a27 -1.550288 0.256610 -6.041 1.53e-09 ***
## a28 0.051294 0.218768 0.234 0.81462
## a29 0.664754 0.240499 2.764 0.00571 **
## a30 0.408897 0.179141 2.283 0.02246 *
## a31 0.461484 0.225299 2.048 0.04053 *
## a32 0.083709 0.205502 0.407 0.68376
## a33 0.536791 0.221951 2.419 0.01558 *
## a34 -0.345187 0.177437 -1.945 0.05173 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 457.39 on 349 degrees of freedom
## Residual deviance: 358.05 on 337 degrees of freedom
## AIC: 384.05
##
## Number of Fisher Scoring iterations: 5
# Predicted probabilities for the test rows. For a two-level factor response a
# binomial glm models the probability of the second level.
y_prob <- predict(
  object = classifier,
  type = "response",
  newdata = test_set
)

# Turn probabilities into class labels using the same levels as the actual
# labels. The earlier 0/1 coding did not match the 1/2 labels, so the
# confusion matrix only lined up by position and would have lost a column
# (making diag() wrong) whenever one class was never predicted.
class_levels <- levels(test_set[, 13])
y_pred <- factor(
  ifelse(y_prob > 0.5, class_levels[2], class_levels[1]),
  levels = class_levels
)

# Confusion matrix: rows are the actual classes, columns the predictions.
cm <- table(test_set[, 13], y_pred)
cm
## y_pred
## 1 2
## 1 65 61
## 2 8 216

# Accuracy = correctly classified rows / all evaluated rows.
sum(diag(cm)) / sum(cm)
## [1] 0.8028571