getwd()
## [1] "/cloud/project"
# make sure the packages for this chapter
# are installed, install if necessary
pkg <- c("ggplot2", "scales", "maptools",
"sp", "maps", "grid", "car" )
new.pkg <- pkg[!(pkg %in% installed.packages())]
if (length(new.pkg)) {
install.packages(new.pkg)
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
## Warning: package 'maptools' is not available for this version of R
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
# read the CSV with headers
regression1<-read.csv("incidents.csv", header=T,sep =",")
regression1
## area zone population incidents
## 1 Boulder west 107,353 605
## 2 California-lexington east 326,534 103
## 3 Huntsville east 444,752 161
## 4 Seattle west 750,000 1703
## 5 San Jose west 64,403 1003
## 6 Miami east 2,744,878 527
## 7 Phoenix west 1,600,000 721
## 8 Houston west 2,333,000 704
## 9 Philadelphia east 1,572,816 105
## 10 Washington east 712,091 403
## 11 Dallas-Fort Worth west 6,900,000 803
## 12 Chicago east 2,700,000 302
## 13 Boston east 4,900,000 205
## 14 Los Angeles west 4,200,000 1003
## 15 New York east 5,200,000 703
## 16 San Francisco west 7,100,000 2072
summary(regression1)
## area zone population incidents
## Length:16 Length:16 Length:16 Min. : 103.0
## Class :character Class :character Class :character 1st Qu.: 277.8
## Mode :character Mode :character Mode :character Median : 654.0
## Mean : 695.2
## 3rd Qu.: 853.0
## Max. :2072.0
str(regression1)
## 'data.frame': 16 obs. of 4 variables:
## $ area : chr "Boulder" "California-lexington" "Huntsville" "Seattle" ...
## $ zone : chr "west" "east" "east" "west" ...
## $ population: chr "107,353" "326,534" "444,752" "750,000" ...
## $ incidents : int 605 103 161 1703 1003 527 721 704 105 403 ...
regression1$population <- as.numeric(gsub(",","",regression1$population))
regression1$population
## [1] 107353 326534 444752 750000 64403 2744878 1600000 2333000 1572816
## [10] 712091 6900000 2700000 4900000 4200000 5200000 7100000
str(regression1$population)
## num [1:16] 107353 326534 444752 750000 64403 ...
regression2<-regression1[,-1]#new data frame with the deletion of column 1
head(regression2)
## zone population incidents
## 1 west 107353 605
## 2 east 326534 103
## 3 east 444752 161
## 4 west 750000 1703
## 5 west 64403 1003
## 6 east 2744878 527
reg.fit1<-lm(regression1$incidents ~ regression1$population)
summary(reg.fit1)
##
## Call:
## lm(formula = regression1$incidents ~ regression1$population)
##
## Residuals:
## Min 1Q Median 3Q Max
## -684.5 -363.5 -156.2 133.9 1164.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.749e+02 2.018e+02 2.353 0.0337 *
## regression1$population 8.462e-05 5.804e-05 1.458 0.1669
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 534.9 on 14 degrees of freedom
## Multiple R-squared: 0.1318, Adjusted R-squared: 0.0698
## F-statistic: 2.126 on 1 and 14 DF, p-value: 0.1669
##Is Population significant at 5%? NO ##The slope’s p‑value is 0.167, which is greater than 0.05, so Population is not statistically significant at the 5% level.
##Adjusted R² of the model: ##0.070 (≈ 0.0698).
reg.fit2<-lm(incidents ~ zone+population, data = regression1)
summary(reg.fit2)
##
## Call:
## lm(formula = incidents ~ zone + population, data = regression1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -537.21 -273.14 -57.89 188.17 766.03
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.612e+02 1.675e+02 0.962 0.35363
## zonewest 7.266e+02 1.938e+02 3.749 0.00243 **
## population 6.557e-05 4.206e-05 1.559 0.14300
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 384.8 on 13 degrees of freedom
## Multiple R-squared: 0.5828, Adjusted R-squared: 0.5186
## F-statistic: 9.081 on 2 and 13 DF, p-value: 0.003404
regression1$zone <- ifelse(regression1$zone == "west", 1, 0)#Please explain the syntax and the output
#View(regression1)
str(regression1)
## 'data.frame': 16 obs. of 4 variables:
## $ area : chr "Boulder" "California-lexington" "Huntsville" "Seattle" ...
## $ zone : num 1 0 0 1 1 0 1 1 0 0 ...
## $ population: num 107353 326534 444752 750000 64403 ...
## $ incidents : int 605 103 161 1703 1003 527 721 704 105 403 ...
#regression1$zone<-as.integer((regression1$zone),replace=TRUE) was not necessary
interaction<-regression1$zone*regression1$population#Explain the syntax
reg.fit3<-lm(regression1$incidents~interaction+regression1$population+regression1$zone)
reg.fit3
##
## Call:
## lm(formula = regression1$incidents ~ interaction + regression1$population +
## regression1$zone)
##
## Coefficients:
## (Intercept) interaction regression1$population
## 1.659e+02 2.974e-06 6.351e-05
## regression1$zone
## 7.192e+02
summary(reg.fit3)
##
## Call:
## lm(formula = regression1$incidents ~ interaction + regression1$population +
## regression1$zone)
##
## Residuals:
## Min 1Q Median 3Q Max
## -540.91 -270.93 -59.56 187.99 767.99
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.659e+02 2.313e+02 0.717 0.4869
## interaction 2.974e-06 9.469e-05 0.031 0.9755
## regression1$population 6.352e-05 7.868e-05 0.807 0.4352
## regression1$zone 7.192e+02 3.108e+02 2.314 0.0392 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 400.5 on 12 degrees of freedom
## Multiple R-squared: 0.5829, Adjusted R-squared: 0.4786
## F-statistic: 5.589 on 3 and 12 DF, p-value: 0.01237
##Answers (5% significance level): ##Population: Not significant (p = 0.435 > 0.05 ##Zone (West vs. East): Significant (p = 0.039 < 0.05) ##Interaction (Population × Zone[West]): Not significant (p = 0.975 > 0.05) ##Adjusted R²: 0.479 (≈ 0.4786)
reg.fit4<-lm(regression1$incidents~interaction)
reg.fit4
##
## Call:
## lm(formula = regression1$incidents ~ interaction)
##
## Coefficients:
## (Intercept) interaction
## 4.951e+02 1.389e-04
summary(reg.fit4)
##
## Call:
## lm(formula = regression1$incidents ~ interaction)
##
## Residuals:
## Min 1Q Median 3Q Max
## -650.28 -301.09 -83.71 123.23 1103.76
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.951e+02 1.320e+02 3.751 0.00215 **
## interaction 1.389e-04 4.737e-05 2.932 0.01093 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 451.9 on 14 degrees of freedom
## Multiple R-squared: 0.3804, Adjusted R-squared: 0.3361
## F-statistic: 8.595 on 1 and 14 DF, p-value: 0.01093
# Running the Poisson GLM
model_glm <- glm(incidents ~ zone + offset(log(population)),
data = regression2,
family = poisson(link = "log"))
# Check the results
summary(model_glm)
##
## Call:
## glm(formula = incidents ~ zone + offset(log(population)), family = poisson(link = "log"),
## data = regression2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.91109 0.01996 -446.36 <2e-16 ***
## zonewest 1.01885 0.02269 44.91 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 16731 on 15 degrees of freedom
## Residual deviance: 14370 on 14 degrees of freedom
## AIC: 14503
##
## Number of Fisher Scoring iterations: 6
model_simple_glm <- glm(incidents ~ zone, data = regression2, family = poisson)
summary(model_simple_glm)
##
## Call:
## glm(formula = incidents ~ zone, family = poisson, data = regression2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.74820 0.01996 287.93 <2e-16 ***
## zonewest 1.23350 0.02269 54.37 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 6200.4 on 15 degrees of freedom
## Residual deviance: 2657.0 on 14 degrees of freedom
## AIC: 2789.8
##
## Number of Fisher Scoring iterations: 5
model_int_glm <- glm(incidents ~ zone * population + offset(log(population)),
data = regression2,
family = poisson)
summary(model_int_glm)
##
## Call:
## glm(formula = incidents ~ zone * population + offset(log(population)),
## family = poisson, data = regression2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.991e+00 4.081e-02 -195.80 < 2e-16 ***
## zonewest 1.739e+00 4.502e-02 38.64 < 2e-16 ***
## population -2.742e-07 1.194e-08 -22.96 < 2e-16 ***
## zonewest:population -1.007e-07 1.283e-08 -7.85 4.15e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 16730.6 on 15 degrees of freedom
## Residual deviance: 7077.5 on 12 degrees of freedom
## AIC: 7214.2
##
## Number of Fisher Scoring iterations: 5
# Calculate McFadden's Pseudo-R2
pseudo_r2 <- 1 - (model_int_glm$deviance / model_int_glm$null.deviance)
print(pseudo_r2)
## [1] 0.5769745