This dataset has 24 rows and 2 columns as shown below.
options(digits = 2)
df <- read.csv("../data/lawnmowers.csv")
df
## income lotsize owner
## 1 60 18 1
## 2 86 17 1
## 3 65 22 1
## 4 62 21 1
## 5 87 24 1
## 6 110 19 1
## 7 108 18 1
## 8 83 22 1
## 9 69 20 1
## 10 93 21 1
## 11 51 22 1
## 12 81 20 1
## 13 75 20 0
## 14 53 21 0
## 15 65 17 0
## 16 43 20 0
## 17 84 18 0
## 18 49 18 0
## 19 59 16 0
## 20 66 18 0
## 21 47 16 0
## 22 33 19 0
## 23 51 14 0
## 24 63 15 0
##Change the colname to 'actual' from 'owner'
names(df)[names(df) == 'owner'] <- 'actual'
##Predict ownership using income and lot size
reg1 <- glm(actual ~ ., data = df, family = "binomial")
options(scipen=999)
summary(reg1)
##
## Call:
## glm(formula = actual ~ ., family = "binomial", data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7404 -0.2969 0.0044 0.4475 1.8682
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -25.9382 11.4871 -2.26 0.024 *
## income 0.1109 0.0543 2.04 0.041 *
## lotsize 0.9638 0.4628 2.08 0.037 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 33.271 on 23 degrees of freedom
## Residual deviance: 15.323 on 21 degrees of freedom
## AIC: 21.32
##
## Number of Fisher Scoring iterations: 6
## Now calculate the overall "Pseudo R-squared" and its p-value
## NOTE: Since we are doing logistic regression...
## Null deviance = 2*(0 - LogLikelihood(null model))
## = -2*LogLikelihood(null model)
## Residual deviance = -2*LogLikelihood(proposed model)
ll.null <- reg1$null.deviance/-2
ll.proposed <- reg1$deviance/-2
## McFadden's Pseudo R^2 = [ LL(Null) - LL(Proposed) ] / LL(Null)
(ll.null - ll.proposed) / ll.null
## [1] 0.54
## chi-square value = 2*(LL(Proposed) - LL(Null))
## p-value = 1 - pchisq(chi-square value, df = 2-1)
1 - pchisq(2*(ll.proposed - ll.null), df=1)
## [1] 0.000023
1 - pchisq((reg1$null.deviance - reg1$deviance), df=1)
## [1] 0.000023
##Predict ownership using lot size only
reg2 <- glm(actual ~ lotsize, data = df, family = "binomial")
summary(reg2)
##
## Call:
## glm(formula = actual ~ lotsize, family = "binomial", data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -0.8297 0.0175 0.7817 1.8015
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -12.187 5.241 -2.33 0.020 *
## lotsize 0.642 0.274 2.34 0.019 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 33.271 on 23 degrees of freedom
## Residual deviance: 24.718 on 22 degrees of freedom
## AIC: 28.72
##
## Number of Fisher Scoring iterations: 4
psuedoR2 <- 1-reg2$deviance/reg2$null.deviance
psuedoR2
## [1] 0.26
##Predict ownership using income only
reg3 <- glm(actual ~ income, data = df, family = "binomial")
summary(reg3)
##
## Call:
## glm(formula = actual ~ income, family = "binomial", data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8139 -0.7576 -0.0474 0.6980 1.8035
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.7937 2.4695 -2.35 0.019 *
## income 0.0860 0.0366 2.35 0.019 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 33.271 on 23 degrees of freedom
## Residual deviance: 23.984 on 22 degrees of freedom
## AIC: 27.98
##
## Number of Fisher Scoring iterations: 5
psuedoR2 <- 1-reg3$deviance/reg3$null.deviance
psuedoR2
## [1] 0.28
##Predict ownership using categorical lot size only
meanLot <- mean(df$lotsize)
df[df$lotsize > meanLot, ]$lotsize = "High"
df[df$lotsize <= meanLot, ]$lotsize = "Low"
df$lotsize <- as.factor(df$lotsize)
xtabs(~ actual + lotsize, data = df)
## lotsize
## actual High Low
## 0 3 9
## 1 9 3
## lotsize
##actual High Low
# 0 3 9
# 1 9 3
#log(odds) of High lotsize for being owner
options(digits=4)
log(9/3) ##1.099
## [1] 1.099
#log(odds) of Low lotsize for being owner
log(3/9) ##-1.099
## [1] -1.099
##difference in log(odds) between low and high lotsize
log(3/9)-log(9/3) #-2.197
## [1] -2.197
reg4 <- glm(actual ~ lotsize, data = df, family = "binomial")
summary(reg4)
##
## Call:
## glm(formula = actual ~ lotsize, family = "binomial", data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.665 -0.758 0.000 0.758 1.665
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.099 0.667 1.65 0.099 .
## lotsizeLow -2.197 0.943 -2.33 0.020 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 33.271 on 23 degrees of freedom
## Residual deviance: 26.992 on 22 degrees of freedom
## AIC: 30.99
##
## Number of Fisher Scoring iterations: 4
##regression coeffs show the same results as above
psuedoR2 <- 1-reg4$deviance/reg4$null.deviance
psuedoR2
## [1] 0.1887