library(cvms) #plot confusion matrix
library(broom) # tidy()
library(tibble) # tibble()
library(caret) #confusionMatrix
df<- read.csv("Kid.csv")
table(is.na(df))
##
## FALSE
## 11441
names(df)
## [1] "Buy" "Income" "Is.Female" "Is.Married"
## [5] "Has.College" "Is.Professional" "Is.Retired" "Unemployed"
## [9] "Residence.Length" "Dual.Income" "Minors" "Own"
## [13] "House" "Is.USBorn" "English" "Prev.Child.Mag"
## [17] "Prev.Parent.Mag"
df$Buy<- factor(df$Buy, c("0", "1"))
train_idx<- sample(dim(df)[1], 0.8*dim(df)[1] )
train<- df[train_idx, ]
test<- df[-train_idx, ]
dim(train)
## [1] 538 17
dim(test)
## [1] 135 17
logreg<- glm(Buy ~ ., data= train, family= "binomial" )
summary (logreg)
##
## Call:
## glm(formula = Buy ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.28550 -0.07603 -0.00638 -0.00092 2.61675
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.929e+01 2.668e+00 -7.228 4.91e-13 ***
## Income 2.148e-04 2.872e-05 7.481 7.37e-14 ***
## Is.Female 2.013e+00 5.487e-01 3.668 0.000244 ***
## Is.Married 2.778e-01 6.969e-01 0.399 0.690155
## Has.College -2.471e-01 5.111e-01 -0.483 0.628856
## Is.Professional 1.023e-01 5.306e-01 0.193 0.847053
## Is.Retired -1.054e+00 1.081e+00 -0.975 0.329687
## Unemployed 7.605e-01 5.432e+00 0.140 0.888649
## Residence.Length 1.180e-02 1.572e-02 0.751 0.452900
## Dual.Income 6.539e-01 6.307e-01 1.037 0.299865
## Minors 1.271e+00 5.234e-01 2.428 0.015184 *
## Own 1.461e+00 6.604e-01 2.212 0.026958 *
## House -8.635e-01 7.082e-01 -1.219 0.222733
## Is.USBorn 2.199e+00 6.226e-01 3.532 0.000412 ***
## English 1.803e+00 9.008e-01 2.001 0.045383 *
## Prev.Child.Mag 7.986e-01 8.157e-01 0.979 0.327521
## Prev.Parent.Mag 2.814e-01 7.111e-01 0.396 0.692334
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 522.54 on 537 degrees of freedom
## Residual deviance: 140.61 on 521 degrees of freedom
## AIC: 174.61
##
## Number of Fisher Scoring iterations: 9
df_reduced<- df[, c(1, 2,3,10, 11, 14, 16)]
train_idx<- sample(dim(df_reduced)[1], 0.8*dim(df_reduced)[1] )
train<- df_reduced[train_idx, ]
test<- df_reduced[-train_idx, ]
dim(train)
## [1] 538 7
dim(test)
## [1] 135 7
logreg<- glm(Buy ~ ., data= train, family= "binomial" )
summary (logreg)
##
## Call:
## glm(formula = Buy ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.07903 -0.12336 -0.02152 -0.00568 2.73415
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.390e+01 1.669e+00 -8.326 < 2e-16 ***
## Income 1.721e-04 2.042e-05 8.425 < 2e-16 ***
## Is.Female 1.314e+00 4.369e-01 3.009 0.002622 **
## Dual.Income 1.431e+00 4.294e-01 3.333 0.000859 ***
## Minors 8.802e-01 4.226e-01 2.083 0.037260 *
## Is.USBorn 1.643e+00 5.519e-01 2.978 0.002906 **
## Prev.Child.Mag 7.760e-01 7.977e-01 0.973 0.330615
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 495.39 on 537 degrees of freedom
## Residual deviance: 161.12 on 531 degrees of freedom
## AIC: 175.12
##
## Number of Fisher Scoring iterations: 8
pr<- predict(logreg, test, type= "response")
pr.classes<- ifelse(pr > 0.5, "1", "0")
pr.classes<- factor(pr.classes, c("0", "1"))
cm<- confusionMatrix(pr.classes, test$Buy)
d_binomial <- tibble("target" = test$Buy,"prediction" = pr.classes)
basic_table <- table(d_binomial)
cfm <- tidy(basic_table)
plot_confusion_matrix(cfm, target_col = "target", prediction_col = "prediction", counts_col = "n")
Logistic regression: StatQuest - Youtube https://www.youtube.com/watch?v=C4N3_XJJ-jU&t=36s
Linear models https://www.statmethods.net/advstats/glm.html
Logistic regression http://www.sthda.com/english/articles/36-classification-methods-essentials/151-logistic-regression-essentials-in-r/ Plotting confusion matrix https://cran.r-project.org/web/packages/cvms/vignettes/Creating_a_confusion_matrix.html