getwd()
## [1] "/cloud/project"
# read the CSV with headers
regression1<-read.csv("incidents.csv",header=T,sep = ",")
str(regression1)
## 'data.frame':    16 obs. of  4 variables:
##  $ area      : chr  "Boulder" "California-lexington" "Huntsville" "Seattle" ...
##  $ zone      : chr  "west" "east" "east" "west" ...
##  $ population: chr  "107,353" "326,534" "444,752" "750,000" ...
##  $ incidents : int  605 103 161 1703 1003 527 721 704 105 403 ...
summary(regression1)
##      area               zone            population          incidents     
##  Length:16          Length:16          Length:16          Min.   : 103.0  
##  Class :character   Class :character   Class :character   1st Qu.: 277.8  
##  Mode  :character   Mode  :character   Mode  :character   Median : 654.0  
##                                                           Mean   : 695.2  
##                                                           3rd Qu.: 853.0  
##                                                           Max.   :2072.0
# make sure the packages for this chapter
# are installed, install if necessary
pkg <- c("ggplot2", "scales", "maptools",
              "sp", "maps", "grid", "car" )
new.pkg <- pkg[!(pkg %in% installed.packages())]
if (length(new.pkg)) {
  install.packages(new.pkg)  
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
## Warning: package 'maptools' is not available for this version of R
## 
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
regression1$population <- as.numeric(gsub(",","",regression1$population))
regression1$population
##  [1]  107353  326534  444752  750000   64403 2744878 1600000 2333000 1572816
## [10]  712091 6900000 2700000 4900000 4200000 5200000 7100000
str(regression1$population)
##  num [1:16] 107353 326534 444752 750000 64403 ...
regression2<-regression1[,-1]#new data frame with the deletion of column 1 
head(regression2)
reg.fit1<-lm(regression2$incidents~regression2$population)
summary(reg.fit1)
## 
## Call:
## lm(formula = regression2$incidents ~ regression2$population)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -684.5 -363.5 -156.2  133.9 1164.7 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)  
## (Intercept)            4.749e+02  2.018e+02   2.353   0.0337 *
## regression2$population 8.462e-05  5.804e-05   1.458   0.1669  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 534.9 on 14 degrees of freedom
## Multiple R-squared:  0.1318, Adjusted R-squared:  0.0698 
## F-statistic: 2.126 on 1 and 14 DF,  p-value: 0.1669
reg.fit2<-lm(incidents~zone+population,data=regression2)
summary(reg.fit2)
## 
## Call:
## lm(formula = incidents ~ zone + population, data = regression2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -537.21 -273.14  -57.89  188.17  766.03 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 1.612e+02  1.675e+02   0.962  0.35363   
## zonewest    7.266e+02  1.938e+02   3.749  0.00243 **
## population  6.557e-05  4.206e-05   1.559  0.14300   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 384.8 on 13 degrees of freedom
## Multiple R-squared:  0.5828, Adjusted R-squared:  0.5186 
## F-statistic: 9.081 on 2 and 13 DF,  p-value: 0.003404
regression2$zone<-ifelse(regression2$zone=="west",1,0)
interaction<-regression2$zone*regression2$population
reg.fit3<-lm(regression2$incidents~interaction+regression2$population+regression2$zone)
summary(reg.fit3)
## 
## Call:
## lm(formula = regression2$incidents ~ interaction + regression2$population + 
##     regression2$zone)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -540.91 -270.93  -59.56  187.99  767.99 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)  
## (Intercept)            1.659e+02  2.313e+02   0.717   0.4869  
## interaction            2.974e-06  9.469e-05   0.031   0.9755  
## regression2$population 6.352e-05  7.868e-05   0.807   0.4352  
## regression2$zone       7.192e+02  3.108e+02   2.314   0.0392 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 400.5 on 12 degrees of freedom
## Multiple R-squared:  0.5829, Adjusted R-squared:  0.4786 
## F-statistic: 5.589 on 3 and 12 DF,  p-value: 0.01237
reg.fit4<-lm(regression2$incidents~interaction)
summary(reg.fit4)
## 
## Call:
## lm(formula = regression2$incidents ~ interaction)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -650.28 -301.09  -83.71  123.23 1103.76 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 4.951e+02  1.320e+02   3.751  0.00215 **
## interaction 1.389e-04  4.737e-05   2.932  0.01093 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 451.9 on 14 degrees of freedom
## Multiple R-squared:  0.3804, Adjusted R-squared:  0.3361 
## F-statistic: 8.595 on 1 and 14 DF,  p-value: 0.01093
# Running the Poisson GLM
model_glm <- glm(incidents ~ zone + offset(log(population)), 
                 data = regression2, 
                 family = poisson(link = "log"))

# Check the results
summary(model_glm)
## 
## Call:
## glm(formula = incidents ~ zone + offset(log(population)), family = poisson(link = "log"), 
##     data = regression2)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.91109    0.01996 -446.36   <2e-16 ***
## zone         1.01885    0.02269   44.91   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 16731  on 15  degrees of freedom
## Residual deviance: 14370  on 14  degrees of freedom
## AIC: 14503
## 
## Number of Fisher Scoring iterations: 6
model_simple_glm <- glm(incidents ~ zone, data = regression2, family = poisson)
summary(model_simple_glm)
## 
## Call:
## glm(formula = incidents ~ zone, family = poisson, data = regression2)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  5.74820    0.01996  287.93   <2e-16 ***
## zone         1.23350    0.02269   54.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 6200.4  on 15  degrees of freedom
## Residual deviance: 2657.0  on 14  degrees of freedom
## AIC: 2789.8
## 
## Number of Fisher Scoring iterations: 5
model_int_glm <- glm(incidents ~ zone * population + offset(log(population)), 
                     data = regression2, 
                     family = poisson)
summary(model_int_glm)
## 
## Call:
## glm(formula = incidents ~ zone * population + offset(log(population)), 
##     family = poisson, data = regression2)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -7.991e+00  4.081e-02 -195.80  < 2e-16 ***
## zone             1.739e+00  4.502e-02   38.64  < 2e-16 ***
## population      -2.742e-07  1.194e-08  -22.96  < 2e-16 ***
## zone:population -1.007e-07  1.283e-08   -7.85 4.15e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 16730.6  on 15  degrees of freedom
## Residual deviance:  7077.5  on 12  degrees of freedom
## AIC: 7214.2
## 
## Number of Fisher Scoring iterations: 5
# Calculate McFadden's Pseudo-R2
pseudo_r2 <- 1 - (model_int_glm$deviance / model_int_glm$null.deviance)
print(pseudo_r2)
## [1] 0.5769745

Model 1: reg.fit1 (incidents ~ population)

Is Population significant at 5%? No. The p-value for population is 0.1669, which is greater than 0.05. Therefore, population is not statistically significant at the 5% level.

What is the adjusted R-squared? The adjusted R² is 0.0698 (about 7%).

This means the model explains only about 7% of the variability in incidents, which is very low. So population alone does not explain incidents well in this dataset.

Model 2: reg.fit2 (incidents ~ zone + population)

Is Population significant at 5%? No. The p-value for population is 0.1430, which is greater than 0.05. So population is not statistically significant at the 5% level.

Is Zone significant at 5%? Yes. The p-value for zone is 0.00243, which is less than 0.05. Therefore, zone is statistically significant.

What is the adjusted R-squared? The adjusted R² is 0.5186 (about 52%).

This means the model explains about 52% of the variability in incidents after adjusting for the number of predictors, which is a substantial improvement compared to Model 1.

Model with interaction (reg.fit3)

Is Population significant at 5%? No. The p-value for population is 0.4352, which is greater than 0.05. Therefore, population is not statistically significant at the 5% level.

Is Zone significant at 5%? Yes. The p-value for zone is 0.0392, which is less than 0.05. Therefore, zone is statistically significant.

Is the interaction term significant at 5%? No. The p-value for the interaction term is 0.9755, which is much greater than 0.05. So, the interaction is not significant.

What is the adjusted R-squared of this model? The adjusted R² is 0.4786 (about 47.86%). This means the model explains about 48% of the variability in incidents after adjusting for the number of predictors.

Model with only the interaction term (reg.fit4)

Is the interaction term significant at 5%? Yes. The p-value is 0.01093, which is less than 0.05. So, the interaction term is significant in this model.

What is the adjusted R-squared? The adjusted R² is 0.3361 (about 33.61%). This model explains about 34% of the variability in incidents.

Among all four linear regression models (fit1–fit4), reg.fit2 (incidents ~ zone + population) is the best choice for prediction for the following reasons:

fit2 has the highest adjusted R² (52%), meaning it explains the largest proportion of variability in incidents after accounting for the number of predictors. This makes it the strongest model in terms of explanatory power.

Zone is highly significant (p = 0.00243 < 0.05).

Population is not statistically significant at 5%, but it still contributes to improving the overall model fit.