# Packages this analysis expects to have installed.
pkg <- c("ggplot2", "scales", "maptools",
         "sp", "maps", "grid", "car")
# Compare against the installed package *names* only: installed.packages()
# returns a character matrix with one row per package (row names are the
# package names), so matching against the whole matrix would also test every
# other cell (versions, library paths, dependency fields, ...).
new.pkg <- pkg[!(pkg %in% rownames(installed.packages()))]
# Install only what is missing; nothing happens when everything is present.
if (length(new.pkg) > 0) {
  install.packages(new.pkg)
}
# Read the incident data; the first row of the CSV holds the column names.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
reg1 <- read.csv("incidents.csv", header = TRUE, sep = ",")
# Print the full data frame, then a per-column summary.
reg1
summary(reg1)
area zone population incidents
Length:16 Length:16 Length:16 Min. : 103.0
Class :character Class :character Class :character 1st Qu.: 277.8
Mode :character Mode :character Mode :character Median : 654.0
Mean : 695.2
3rd Qu.: 853.0
Max. :2072.0
# Show the structure of reg1: column names, column types and leading values.
str(reg1)
'data.frame': 16 obs. of 4 variables:
$ area : chr "Boulder" "California-lexington" "Huntsville" "Seattle" ...
$ zone : chr "west" "east" "east" "west" ...
$ population: chr "107,353" "326,534" "444,752" "750,000" ...
$ incidents : int 605 103 161 1703 1003 527 721 704 105 403 ...
# population was read as text because of its thousands separators (e.g.
# "107,353"); drop the commas and convert the column to numeric.
reg1$population <- as.numeric(
  gsub(",", "", reg1$population, fixed = TRUE)
)
# Echo the converted column to confirm the result.
reg1$population
[1] 107353 326534 444752 750000 64403 2744878 1600000 2333000 1572816 712091 6900000 2700000 4900000 4200000 5200000 7100000
# Confirm that population is now stored as a numeric vector.
str(reg1$population)
num [1:16] 107353 326534 444752 750000 64403 ...
# Drop the first column (area); the models only use zone, population and
# incidents.
reg2 <- reg1[, -1]
head(reg2)
# Simple linear regression of incidents on population. Both variables are
# looked up in reg2 through `data =` rather than mixing reg2$incidents with
# reg1$population: the estimates are unchanged because reg2 is a column subset
# of reg1, but the response and predictor can no longer drift out of step, and
# the fitted model can be used with predict(newdata = ...). The coefficient is
# labelled `population` in the summary instead of `reg1$population`.
reg.fit1 <- lm(incidents ~ population, data = reg2)
summary(reg.fit1)
Call:
lm(formula = reg2$incidents ~ reg1$population)
Residuals:
Min 1Q Median 3Q Max
-684.5 -363.5 -156.2 133.9 1164.7
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.749e+02 2.018e+02 2.353 0.0337 *
reg1$population 8.462e-05 5.804e-05 1.458 0.1669
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 534.9 on 14 degrees of freedom
Multiple R-squared: 0.1318, Adjusted R-squared: 0.0698
F-statistic: 2.126 on 1 and 14 DF, p-value: 0.1669
Based on the output obtained above, please answer the following question:
Is Population significant at a 5% significance level? What is the adjusted-R squared of the model?
No. The p-value for population (0.1669) is greater than 0.05, so we fail to reject the null hypothesis. The adjusted R-squared of the model is 0.0698.
# Multiple regression: incidents explained by zone and population, with both
# predictors taken from reg2. zone is still a character column at this point,
# so lm() treats it as categorical (the `zonewest` row in the summary).
reg.fit2 <- lm(
  incidents ~ zone + population,
  data = reg2
)
summary(reg.fit2)
Call:
lm(formula = incidents ~ zone + population, data = reg2)
Residuals:
Min 1Q Median 3Q Max
-537.21 -273.14 -57.89 188.17 766.03
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.612e+02 1.675e+02 0.962 0.35363
zonewest 7.266e+02 1.938e+02 3.749 0.00243 **
population 6.557e-05 4.206e-05 1.559 0.14300
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 384.8 on 13 degrees of freedom
Multiple R-squared: 0.5828, Adjusted R-squared: 0.5186
F-statistic: 9.081 on 2 and 13 DF, p-value: 0.003404
Based on the output obtained above, please answer the following question:
# Recode zone as a 0/1 dummy variable. ifelse(test, yes, no) is vectorised:
# for each row it returns 1 where zone equals "west" and 0 for any other
# non-missing value. The result overwrites the zone column, turning it from
# character into numeric.
reg2$zone <- ifelse(reg2$zone == "west", 1, 0)#Please explain the syntax and the output
# Print the recoded data frame and its structure to check the new column.
reg2
str(reg2)
'data.frame': 16 obs. of 3 variables:
$ zone : num 1 0 0 1 1 0 1 1 0 0 ...
$ population: num 107353 326534 444752 750000 64403 ...
$ incidents : int 605 103 161 1703 1003 527 721 704 105 403 ...
# Build the interaction term by hand: `*` multiplies the two columns element
# by element, so with zone coded 0/1 the product equals population for west
# rows and 0 for the others.
interaction <- reg2$zone * reg2$population
# Regress incidents on the interaction term plus both main effects.
reg.fit3 <- lm(
  reg2$incidents ~ interaction + reg2$population + reg2$zone
)
summary(reg.fit3)
Call:
lm(formula = reg2$incidents ~ interaction + reg2$population +
reg2$zone)
Residuals:
Min 1Q Median 3Q Max
-540.91 -270.93 -59.56 187.99 767.99
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.659e+02 2.313e+02 0.717 0.4869
interaction 2.974e-06 9.469e-05 0.031 0.9755
reg2$population 6.352e-05 7.868e-05 0.807 0.4352
reg2$zone 7.192e+02 3.108e+02 2.314 0.0392 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 400.5 on 12 degrees of freedom
Multiple R-squared: 0.5829, Adjusted R-squared: 0.4786
F-statistic: 5.589 on 3 and 12 DF, p-value: 0.01237
Based on the output obtained above, please answer the following question:
Is Population significant at a 5% significance level?
No. The p-value for population is 0.4352, which is greater than 0.05, so we fail to reject the null hypothesis.
Is Zone significant at a 5% significance level? Yes, because the p value is less than 0.05 WE REJECT THE NULL
Is the interaction term significant at a 5% significance level? What is the adjusted-R squared of the model? No. The p-value for the interaction term (0.9755) is greater than 0.05, so we fail to reject the null hypothesis. The adjusted R-squared of the model is 0.4786.
Let us now run a model where the only feature is the interaction term.
Is the interaction term significant at a 5% significance level? What is the adjusted-R squared of the model?
# Model with the hand-built interaction term as the only predictor.
reg.fit4 <- lm(reg2$incidents ~ interaction)
summary(reg.fit4)
Call:
lm(formula = reg2$incidents ~ interaction)
Residuals:
Min 1Q Median 3Q Max
-650.28 -301.09 -83.71 123.23 1103.76
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.951e+02 1.320e+02 3.751 0.00215 **
interaction 1.389e-04 4.737e-05 2.932 0.01093 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 451.9 on 14 degrees of freedom
Multiple R-squared: 0.3804, Adjusted R-squared: 0.3361
F-statistic: 8.595 on 1 and 14 DF, p-value: 0.01093
Which of the models run above would you choose to make predictions? Why??
I would choose the model incidents ~ zone + population (reg.fit2), since it has the highest adjusted R-squared (0.5186) and the lowest residual standard error (384.8) of the models run above.
# Model with the 0/1 zone dummy as the only predictor.
reg.fit5 <- lm(reg2$incidents ~ reg2$zone)
summary(reg.fit5)
Call:
lm(formula = reg2$incidents ~ reg2$zone)
Residuals:
Min 1Q Median 3Q Max
-471.75 -226.41 -91.19 120.37 995.25
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 313.6 142.8 2.196 0.04546 *
reg2$zone 763.1 202.0 3.778 0.00204 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 404 on 14 degrees of freedom
Multiple R-squared: 0.5048, Adjusted R-squared: 0.4695
F-statistic: 14.27 on 1 and 14 DF, p-value: 0.002037