getwd()
## [1] "/cloud/project"
# read the CSV with headers
regression1<-read.csv("incidents.csv",header=T,sep = ",")
str(regression1)
## 'data.frame': 16 obs. of 4 variables:
## $ area : chr "Boulder" "California-lexington" "Huntsville" "Seattle" ...
## $ zone : chr "west" "east" "east" "west" ...
## $ population: chr "107,353" "326,534" "444,752" "750,000" ...
## $ incidents : int 605 103 161 1703 1003 527 721 704 105 403 ...
summary(regression1)
## area zone population incidents
## Length:16 Length:16 Length:16 Min. : 103.0
## Class :character Class :character Class :character 1st Qu.: 277.8
## Mode :character Mode :character Mode :character Median : 654.0
## Mean : 695.2
## 3rd Qu.: 853.0
## Max. :2072.0
# make sure the packages for this chapter
# are installed, install if necessary
pkg <- c("ggplot2", "scales", "maptools",
"sp", "maps", "grid", "car" )
new.pkg <- pkg[!(pkg %in% installed.packages())]
if (length(new.pkg)) {
install.packages(new.pkg)
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
## Warning: package 'maptools' is not available for this version of R
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
regression1$population <- as.numeric(gsub(",","",regression1$population))
regression1$population
## [1] 107353 326534 444752 750000 64403 2744878 1600000 2333000 1572816
## [10] 712091 6900000 2700000 4900000 4200000 5200000 7100000
str(regression1$population)
## num [1:16] 107353 326534 444752 750000 64403 ...
regression2<-regression1[,-1]#new data frame with the deletion of column 1
head(regression2)
reg.fit1<-lm(regression2$incidents~regression2$population)
summary(reg.fit1)
##
## Call:
## lm(formula = regression2$incidents ~ regression2$population)
##
## Residuals:
## Min 1Q Median 3Q Max
## -684.5 -363.5 -156.2 133.9 1164.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.749e+02 2.018e+02 2.353 0.0337 *
## regression2$population 8.462e-05 5.804e-05 1.458 0.1669
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 534.9 on 14 degrees of freedom
## Multiple R-squared: 0.1318, Adjusted R-squared: 0.0698
## F-statistic: 2.126 on 1 and 14 DF, p-value: 0.1669
reg.fit2<-lm(incidents~zone+population,data=regression2)
summary(reg.fit2)
##
## Call:
## lm(formula = incidents ~ zone + population, data = regression2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -537.21 -273.14 -57.89 188.17 766.03
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.612e+02 1.675e+02 0.962 0.35363
## zonewest 7.266e+02 1.938e+02 3.749 0.00243 **
## population 6.557e-05 4.206e-05 1.559 0.14300
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 384.8 on 13 degrees of freedom
## Multiple R-squared: 0.5828, Adjusted R-squared: 0.5186
## F-statistic: 9.081 on 2 and 13 DF, p-value: 0.003404
regression2$zone<-ifelse(regression2$zone=="west",1,0)
interaction<-regression2$zone*regression2$population
reg.fit3<-lm(regression2$incidents~interaction+regression2$population+regression2$zone)
summary(reg.fit3)
##
## Call:
## lm(formula = regression2$incidents ~ interaction + regression2$population +
## regression2$zone)
##
## Residuals:
## Min 1Q Median 3Q Max
## -540.91 -270.93 -59.56 187.99 767.99
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.659e+02 2.313e+02 0.717 0.4869
## interaction 2.974e-06 9.469e-05 0.031 0.9755
## regression2$population 6.352e-05 7.868e-05 0.807 0.4352
## regression2$zone 7.192e+02 3.108e+02 2.314 0.0392 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 400.5 on 12 degrees of freedom
## Multiple R-squared: 0.5829, Adjusted R-squared: 0.4786
## F-statistic: 5.589 on 3 and 12 DF, p-value: 0.01237
reg.fit4<-lm(regression2$incidents~interaction)
summary(reg.fit4)
##
## Call:
## lm(formula = regression2$incidents ~ interaction)
##
## Residuals:
## Min 1Q Median 3Q Max
## -650.28 -301.09 -83.71 123.23 1103.76
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.951e+02 1.320e+02 3.751 0.00215 **
## interaction 1.389e-04 4.737e-05 2.932 0.01093 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 451.9 on 14 degrees of freedom
## Multiple R-squared: 0.3804, Adjusted R-squared: 0.3361
## F-statistic: 8.595 on 1 and 14 DF, p-value: 0.01093
# Running the Poisson GLM
model_glm <- glm(incidents ~ zone + offset(log(population)),
data = regression2,
family = poisson(link = "log"))
# Check the results
summary(model_glm)
##
## Call:
## glm(formula = incidents ~ zone + offset(log(population)), family = poisson(link = "log"),
## data = regression2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.91109 0.01996 -446.36 <2e-16 ***
## zone 1.01885 0.02269 44.91 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 16731 on 15 degrees of freedom
## Residual deviance: 14370 on 14 degrees of freedom
## AIC: 14503
##
## Number of Fisher Scoring iterations: 6
model_simple_glm <- glm(incidents ~ zone, data = regression2, family = poisson)
summary(model_simple_glm)
##
## Call:
## glm(formula = incidents ~ zone, family = poisson, data = regression2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.74820 0.01996 287.93 <2e-16 ***
## zone 1.23350 0.02269 54.37 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 6200.4 on 15 degrees of freedom
## Residual deviance: 2657.0 on 14 degrees of freedom
## AIC: 2789.8
##
## Number of Fisher Scoring iterations: 5
model_int_glm <- glm(incidents ~ zone * population + offset(log(population)),
data = regression2,
family = poisson)
summary(model_int_glm)
##
## Call:
## glm(formula = incidents ~ zone * population + offset(log(population)),
## family = poisson, data = regression2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.991e+00 4.081e-02 -195.80 < 2e-16 ***
## zone 1.739e+00 4.502e-02 38.64 < 2e-16 ***
## population -2.742e-07 1.194e-08 -22.96 < 2e-16 ***
## zone:population -1.007e-07 1.283e-08 -7.85 4.15e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 16730.6 on 15 degrees of freedom
## Residual deviance: 7077.5 on 12 degrees of freedom
## AIC: 7214.2
##
## Number of Fisher Scoring iterations: 5
# Calculate McFadden's Pseudo-R2
pseudo_r2 <- 1 - (model_int_glm$deviance / model_int_glm$null.deviance)
print(pseudo_r2)
## [1] 0.5769745
Model 1: reg.fit1 (incidents ~ population)
Is Population significant at 5%? No. The p-value for population is 0.1669, which is greater than 0.05. Therefore, population is not statistically significant at the 5% level.
What is the adjusted R-squared? The adjusted R² is 0.0698 (about 7%).
This means the model explains only about 7% of the variability in incidents, which is very low. So population alone does not explain incidents well in this dataset.
Model 2: reg.fit2 (incidents ~ zone + population)
Is Population significant at 5%? No. The p-value for population is 0.1430, which is greater than 0.05. So population is not statistically significant at the 5% level.
Is Zone significant at 5%? Yes. The p-value for zone is 0.00243, which is less than 0.05. Therefore, zone is statistically significant.
What is the adjusted R-squared? The adjusted R² is 0.5186 (about 52%).
This means the model explains about 52% of the variability in incidents after adjusting for the number of predictors, which is a substantial improvement compared to Model 1.
Model with interaction (reg.fit3)
Is Population significant at 5%? No. The p-value for population is 0.4352, which is greater than 0.05. Therefore, population is not statistically significant at the 5% level.
Is Zone significant at 5%? Yes. The p-value for zone is 0.0392, which is less than 0.05. Therefore, zone is statistically significant.
Is the interaction term significant at 5%? No. The p-value for the interaction term is 0.9755, which is much greater than 0.05. So, the interaction is not significant.
What is the adjusted R-squared of this model? The adjusted R² is 0.4786 (about 47.86%). This means the model explains about 48% of the variability in incidents after adjusting for the number of predictors.
Model with only the interaction term (reg.fit4)
Is the interaction term significant at 5%? Yes. The p-value is 0.01093, which is less than 0.05. So, the interaction term is significant in this model.
What is the adjusted R-squared? The adjusted R² is 0.3361 (about 33.61%). This model explains about 34% of the variability in incidents.
Among all four linear regression models (fit1–fit4), reg.fit2 (incidents ~ zone + population) is the best choice for prediction for the following reasons:
fit2 has the highest adjusted R² (52%), meaning it explains the largest proportion of variability in incidents after accounting for the number of predictors. This makes it the strongest model in terms of explanatory power.
Zone is highly significant (p = 0.00243 < 0.05).
Population is not statistically significant at 5%, but it still contributes to improving the overall model fit.