# Report the working directory; "incidents.csv" is read relative to it.
getwd()
## [1] "/cloud/project"

Install necessary packages

# Packages this analysis expects to have installed.
# NOTE(review): install.packages() reports 'maptools' as not available for
# this version of R, and none of these packages is used in the visible part
# of the script -- confirm the list is still needed.
pkg <- c("ggplot2", "scales", "maptools",
         "sp", "maps", "grid", "car")
# Compare against installed package *names* only. installed.packages()
# returns a character matrix, so `%in% installed.packages()` would also match
# unrelated cells (for example a Depends/Imports field equal to "grid").
new.pkg <- pkg[!(pkg %in% rownames(installed.packages()))]
if (length(new.pkg) > 0) {
  install.packages(new.pkg)
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
## Warning: package 'maptools' is not available for this version of R
## 
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages

Read the CSV with headers

# Load the incident data; the first row of the file holds the column names.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
regression1 <- read.csv("incidents.csv", header = TRUE, sep = ",")
#View(regression1)
# Quick overview of every column, followed by the column types.
summary(regression1)
##      area               zone            population          incidents     
##  Length:16          Length:16          Length:16          Min.   : 103.0  
##  Class :character   Class :character   Class :character   1st Qu.: 277.8  
##  Mode  :character   Mode  :character   Mode  :character   Median : 654.0  
##                                                           Mean   : 695.2  
##                                                           3rd Qu.: 853.0  
##                                                           Max.   :2072.0
str(regression1)
## 'data.frame':    16 obs. of  4 variables:
##  $ area      : chr  "Boulder" "California-lexington" "Huntsville" "Seattle" ...
##  $ zone      : chr  "west" "east" "east" "west" ...
##  $ population: chr  "107,353" "326,534" "444,752" "750,000" ...
##  $ incidents : int  605 103 161 1703 1003 527 721 704 105 403 ...

Format population column to numeric

# Strip the thousands separators from population, then convert the cleaned
# text to numeric so it can be used as a regression predictor.
regression1[["population"]] <- as.numeric(
  gsub(",", "", regression1[["population"]], fixed = TRUE)
)
print(regression1[["population"]])
##  [1]  107353  326534  444752  750000   64403 2744878 1600000 2333000 1572816
## [10]  712091 6900000 2700000 4900000 4200000 5200000 7100000
str(regression1[["population"]])
##  num [1:16] 107353 326534 444752 750000 64403 ...

Create new data frame; delete column 1

# Copy of the data without its first column (area).
regression2 <- regression1[, -1]
head(regression2)
# Model 1: simple linear regression of incidents on population.
# Columns are named in the formula and supplied through `data =`, the same way
# reg.fit2 is specified; with `regression1$...` inside the formula, predict()
# cannot substitute new data for the predictors.
reg.fit1 <- lm(incidents ~ population, data = regression1)
summary(reg.fit1)
## 
## Call:
## lm(formula = incidents ~ population, data = regression1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -684.5 -363.5 -156.2  133.9 1164.7 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 4.749e+02  2.018e+02   2.353   0.0337 *
## population  8.462e-05  5.804e-05   1.458   0.1669  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 534.9 on 14 degrees of freedom
## Multiple R-squared:  0.1318, Adjusted R-squared:  0.0698 
## F-statistic: 2.126 on 1 and 14 DF,  p-value: 0.1669

Based on the output obtained above, please answer the following question:

Is Population significant at a 5% significance level? What is the adjusted-R squared of the model?

P-value for Population: 0.1669. Threshold is 0.05. Since 0.1669 > 0.05, we fail to reject the null hypothesis. This means there is not enough statistical evidence to say that population has a significant effect on the number of incidents in this specific model.

Adjusted R-squared: 0.0698. The model has very low explanatory power.

# Model 2: incidents explained by zone (categorical) and population.
reg.fit2 <- lm(formula = incidents ~ zone + population, data = regression1)
print(summary(reg.fit2))
## 
## Call:
## lm(formula = incidents ~ zone + population, data = regression1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -537.21 -273.14  -57.89  188.17  766.03 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 1.612e+02  1.675e+02   0.962  0.35363   
## zonewest    7.266e+02  1.938e+02   3.749  0.00243 **
## population  6.557e-05  4.206e-05   1.559  0.14300   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 384.8 on 13 degrees of freedom
## Multiple R-squared:  0.5828, Adjusted R-squared:  0.5186 
## F-statistic: 9.081 on 2 and 13 DF,  p-value: 0.003404

Based on the output obtained above, please answer the following question:

Are Population and/or Zone significant at a 5% significance level? What is the adjusted-R squared of the model?

Zone is significant at a 5% significance level, its p-value is 0.00243 which is less than 0.05.

Population is still not significant (0.14300 > 0.05)

Adjusted R-squared: 0.5186. This is much better than the previous model. It means that approximately 52% of the variation in the dependent variable is explained by the independent variables, after adjusting for the number of predictors. This model has moderate explanatory power.

# Recode zone as a numeric dummy: 1 where the value is exactly "west", else 0.
regression1$zone <- as.numeric(regression1$zone == "west")

The code above transforms the categorical (text-based) variable zone into a dummy variable (numeric 0 or 1).

It checks every row in the zone column to see whether the value is exactly “west”. If it is, 1 is assigned; otherwise, 0 is assigned.

#View(regression1)
# Inspect the structure after the recoding; zone is now numeric (0/1).
str(regression1)
## 'data.frame':    16 obs. of  4 variables:
##  $ area      : chr  "Boulder" "California-lexington" "Huntsville" "Seattle" ...
##  $ zone      : num  1 0 0 1 1 0 1 1 0 0 ...
##  $ population: num  107353 326534 444752 750000 64403 ...
##  $ incidents : int  605 103 161 1703 1003 527 721 704 105 403 ...
# The following explicit integer conversion of zone was not necessary:
# regression1$zone<-as.integer((regression1$zone),replace=TRUE)

Create a new variable, interaction. This interaction term lets us see whether the effect of one variable (population) changes depending on the value of another variable (zone).

Statistically, it may answer the question “Does the impact of population on the dependent variable change if the location is in the West?”

The population coefficient will tell the effect of population for the non-west zones (where the interaction is 0).

The interaction coefficient will indicate the difference in the effect of population in the west zone.

# Product of the zone dummy and population. It is kept as a standalone vector
# because the interaction-only model fitted further down reads `interaction`.
interaction <- regression1$zone * regression1$population
# Model 3: population, zone and their interaction.
# `population * zone` expands to both main effects plus the population:zone
# product term, which is the same design as the hand-built product above.
# Naming columns via `data =` (as reg.fit2 does) instead of `regression1$...`
# keeps predict() usable with new data.
reg.fit3 <- lm(incidents ~ population * zone, data = regression1)
summary(reg.fit3)
## 
## Call:
## lm(formula = incidents ~ population * zone, data = regression1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -540.91 -270.93  -59.56  187.99  767.99 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)     1.659e+02  2.313e+02   0.717   0.4869  
## population      6.352e-05  7.868e-05   0.807   0.4352  
## zone            7.192e+02  3.108e+02   2.314   0.0392 *
## population:zone 2.974e-06  9.469e-05   0.031   0.9755  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 400.5 on 12 degrees of freedom
## Multiple R-squared:  0.5829, Adjusted R-squared:  0.4786 
## F-statistic: 5.589 on 3 and 12 DF,  p-value: 0.01237

Based on the output obtained above, please answer the following question:

Is Population significant at a 5% significance level? Is Zone significant at a 5% significance level? Is the interaction term significant at a 5% significance level? What is the adjusted-R squared of the model?

Population is not significant at a 5% significance level (0.4352 > 0.05)

Zone is significant at a 5% significance level (0.0392 < 0.05)

Interaction is not significant at a 5% significance level (0.9755 > 0.05)

The adjusted-R squared of the model is 0.4786 (~48%)

Let us now run a model where the only feature is the interaction term.

Is the interaction term significant at a 5% significance level? What is the adjusted-R squared of the model?

# Model 4: the population-by-zone product is the only predictor.
# With zone already recoded to a numeric 0/1 dummy, `population:zone` is the
# element-wise product of the two columns. Taking both from `data =` avoids
# depending on a free-standing global vector and keeps predict() usable with
# new data.
reg.fit4 <- lm(incidents ~ population:zone, data = regression1)
summary(reg.fit4)
## 
## Call:
## lm(formula = incidents ~ population:zone, data = regression1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -650.28 -301.09  -83.71  123.23 1103.76 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)   
## (Intercept)     4.951e+02  1.320e+02   3.751  0.00215 **
## population:zone 1.389e-04  4.737e-05   2.932  0.01093 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 451.9 on 14 degrees of freedom
## Multiple R-squared:  0.3804, Adjusted R-squared:  0.3361 
## F-statistic: 8.595 on 1 and 14 DF,  p-value: 0.01093

Interaction term is significant at a 5% significance level (0.01093 < 0.05)

The adjusted-R squared of the model is 0.3361 (~34%)

Which of the models run above would you choose to make predictions? Why?

I would choose model 2 (zone + population)

Model 2 explains about 52% of the variance in incidents, which is the highest among all four options.

The zone variable is highly significant (p = 0.00243), suggesting that being in the “West” zone has a very strong, measurable impact on the number of incidents.

Model 2 also has the lowest residual standard error (384.8) of the four models.