df <- read.csv("G:\\RStudio\\udemy\\ml\\Machine Learning AZ\\Part 3 - Classification\\Section 14 - Logistic Regression\\Logistic_Regression\\Social_Network_Ads.csv")
head(df)
df <- df[,3:5]
head(df)
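Columns 3 to 5 keep only Age, EstimatedSalary and the 0/1 Purchased label; the User ID and Gender columns are dropped. A quick structural check (expected shape inferred from the 300/100 split used below):

str(df)  # expect 400 obs. of 3 variables: Age, EstimatedSalary, Purchased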
library(caTools)
package 'caTools' was built under R version 3.3.3
set.seed(1234)
split <- sample.split(df$Purchased, SplitRatio = 0.75)
training_set <- subset(df, split == TRUE)
test_set <- subset(df, split == FALSE)
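sample.split stratifies on the label, so the class balance should be roughly the same in both sets. An optional sanity check:

# Verify the stratified 75/25 split: 300 training rows, 100 test rows
nrow(training_set); nrow(test_set)
prop.table(table(training_set$Purchased))
prop.table(table(test_set$Purchased))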
# Feature scaling: column 1 is Age, column 2 is EstimatedSalary
training_set[,1:2] <- scale(training_set[,1:2])
test_set[,1:2] <- scale(test_set[,1:2])
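Note that calling scale() separately standardises each set with its own mean and standard deviation. A stricter variant (a sketch, not what this script does) reuses the training-set statistics for the test set, so test rows are transformed exactly as unseen data would be:

# Alternative: scale the test set with the training set's centre and spread
train_scaled <- scale(training_set[, 1:2])
test_scaled  <- scale(test_set[, 1:2],
                      center = attr(train_scaled, "scaled:center"),
                      scale  = attr(train_scaled, "scaled:scale"))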
classifier <- glm(formula = Purchased ~ Age + EstimatedSalary,
                  family = binomial, data = training_set)
summary(classifier)
Call:
glm(formula = Purchased ~ Age + EstimatedSalary, family = binomial,
    data = training_set)

Deviance Residuals:
    Min       1Q   Median       3Q      Max
-2.9098  -0.5571  -0.1744   0.3933   2.2917

Coefficients:
                Estimate Std. Error z value Pr(>|z|)
(Intercept)      -1.0890     0.1911  -5.700 1.20e-08 ***
Age               2.3769     0.2983   7.969 1.60e-15 ***
EstimatedSalary   1.1488     0.2161   5.316 1.06e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 390.89  on 299  degrees of freedom
Residual deviance: 209.32  on 297  degrees of freedom
AIC: 215.32

Number of Fisher Scoring iterations: 6
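Both predictors are highly significant, and since the features were standardised their coefficients are directly comparable. Exponentiating turns log-odds into odds ratios per one-standard-deviation increase (a small interpretation step added here, not part of the original script):

exp(coef(classifier))
# (Intercept) ~0.34, Age ~10.77, EstimatedSalary ~3.15
# e.g. one SD more Age multiplies the odds of buying by roughly 10.8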
prob_pred <- predict(classifier, type = 'response', newdata = test_set[-3])
y_pred <- ifelse(prob_pred > 0.5, 1, 0)
y_pred
  5  15  19  25  29  40  42  43  49  51  54  55  66  74  76  77  78  90  92 100
  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
104 106 110 112 120 133 136 137 141 142 143 144 145 153 157 159 164 166 172 174
  1   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0
179 181 183 184 195 197 202 203 216 218 221 223 228 229 235 238 239 240 242 244
  0   0   0   0   0   0   1   1   1   0   0   1   1   0   1   0   1   1   0   1
245 249 250 254 256 258 271 273 275 286 287 290 291 294 301 303 305 309 318 319
  0   0   0   1   1   0   1   1   1   0   0   0   1   0   1   1   0   1   0   0
327 330 332 333 338 341 343 351 353 354 357 360 361 363 368 369 373 386 387 394
  0   1   1   0   0   1   0   1   1   0   1   0   1   1   1   0   0   1   1   1
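The 0.5 cutoff is the conventional default. If missing a buyer is costlier than a false alarm, the threshold can be lowered; a minimal sketch with a hypothetical 0.3 cutoff:

# A lower cutoff trades precision for recall on the buyer class
y_pred_03 <- ifelse(prob_pred > 0.3, 1, 0)
table(actual = test_set[, 3], predicted = y_pred_03)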
# Making the confusion matrix
# column 3 (Purchased) is the actual outcome; rows = actual, columns = predicted
cm <- table(test_set[,3], y_pred)
cm
   y_pred
     0  1
  0 59  5
  1  9 27
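So the model gets 59 true negatives and 27 true positives right, with 5 false positives and 9 false negatives: 86 of the 100 test observations are classified correctly. The usual summary metrics follow directly from cm:

accuracy  <- sum(diag(cm)) / sum(cm)   # (59 + 27) / 100 = 0.86
precision <- cm[2, 2] / sum(cm[, 2])   # 27 / 32 ~ 0.84
recall    <- cm[2, 2] / sum(cm[2, ])   # 27 / 36 = 0.75
c(accuracy = accuracy, precision = precision, recall = recall)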
# install.packages("ElemStatLearn")  # now archived on CRAN; install from the CRAN archive if needed
library(ElemStatLearn)
set <- training_set
X1 <- seq(min(set[,1]) - 1, max(set[,1]) + 1, by = 0.01)
X2 <- seq(min(set[,2]) - 1, max(set[,2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
colnames(grid_set) <- c('Age', 'EstimatedSalary')
prob_set <- predict(classifier, type = 'response', newdata = grid_set)
y_grid <- ifelse(prob_set > 0.5, 1, 0)
plot(set[, -3],
     main = 'Logistic Regression (Training Set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
The red region is where the classifier predicts “Don't buy” and the green region is where it predicts “Buy”. The red dots are people who actually did not buy; the green dots are people who actually bought. The line separating the two regions is the prediction (decision) boundary.
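Because logistic regression is linear in its features, that boundary is the straight line where the log-odds are zero, i.e. b0 + b1*Age + b2*EstimatedSalary = 0. It can also be drawn directly from the fitted coefficients instead of via the contour (a sketch, to be run after the plot above):

# Decision boundary: solve b0 + b1*x1 + b2*x2 = 0 for x2 (the salary axis)
b <- coef(classifier)
abline(a = -b[1] / b[3], b = -b[2] / b[3], lwd = 2)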
set <- test_set
X1 <- seq(min(set[,1]) - 1, max(set[,1]) + 1, by = 0.01)
X2 <- seq(min(set[,2]) - 1, max(set[,2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
colnames(grid_set) <- c('Age', 'EstimatedSalary')
prob_set <- predict(classifier, type = 'response', newdata = grid_set)
y_grid <- ifelse(prob_set > 0.5, 1, 0)
plot(set[, -3],
     main = 'Logistic Regression (Test Set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))