Setup
# Load the raw district data, normalise its column names, derive numeric
# school-district and borough codes from jurisdiction_name, and write the
# cleaned table back to disk.
# NOTE(review): the working directory is a hard-coded absolute path; it is
# kept so the relative file names below resolve exactly as before, but the
# script will only run on a machine where this directory exists.
setwd("/Users/brian/Dropbox/GitRepository/data_science_class/class/hw2")
data <- read.csv("districts.csv", stringsAsFactors = FALSE)
# munge headers: lower-case them and replace dots with underscores
names(data) <- tolower(names(data))
names(data) <- gsub("\\.", "_", names(data))
# jurisdiction_name is parsed as "CSD <number> <borough>". The pattern is
# anchored and captures the whole remainder as the borough, so a multi-word
# borough (presumably e.g. "Staten Island" -- verify against the data) no
# longer leaves trailing text behind the district number, which would make
# as.numeric() return NA with a warning.
csd_pattern <- "^CSD ([0-9]+) (.+)$"
# munge school district
data$schooldist <- sub(csd_pattern, "\\1", data$jurisdiction_name)
data$schooldist <- as.numeric(data$schooldist)
# extract boro and encode it as the integer code of its factor level
data$boro <- sub(csd_pattern, "\\2", data$jurisdiction_name)
data$boro <- as.numeric(as.factor(data$boro))
# write munged data to file
write.csv(data, "districts_munged.csv", row.names = FALSE)
Create a function that runs a bivariate regression, draws a scatter plot with the line of best fit, and reports the R-squared and adjusted R-squared values
# Fit the simple linear model y ~ x, draw the scatter plot of y against x
# with the fitted line overlaid, and return the model summary (which
# includes R-squared and adjusted R-squared).
#
# x: predictor vector.
# y: response vector, same length as x.
# Returns the summary.lm object for the fitted model.
linear_regression <- function(x, y) {
  fit <- lm(y ~ x)
  # points first, then the regression line on top of them
  plot(x, y, col = "darkblue", pch = 20)
  abline(fit, col = "darkred", lwd = 2)
  summary(fit)
}
Run the function on six pairs of variables
# Regress percent_black_non_hispanic (y) on percent_male (x).
y <- data[["percent_black_non_hispanic"]]
x <- data[["percent_male"]]
linear_regression(x, y)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3870 -0.1484 -0.0577 0.1316 0.5564
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0577 0.0773 0.75 0.4609
## x 0.5144 0.1860 2.77 0.0096 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.25 on 30 degrees of freedom
## Multiple R-squared: 0.203, Adjusted R-squared: 0.177
## F-statistic: 7.65 on 1 and 30 DF, p-value: 0.00962
# Regress percent_black_non_hispanic (y) on percent_asian_non_hispanic (x).
y <- data[["percent_black_non_hispanic"]]
x <- data[["percent_asian_non_hispanic"]]
linear_regression(x, y)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.252 -0.233 -0.124 0.177 0.748
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2524 0.0529 4.77 4.4e-05 ***
## x -0.2396 0.2530 -0.95 0.35
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.276 on 30 degrees of freedom
## Multiple R-squared: 0.029, Adjusted R-squared: -0.00332
## F-statistic: 0.897 on 1 and 30 DF, p-value: 0.351
# Regress percent_black_non_hispanic (y) on percent_white_non_hispanic (x).
y <- data[["percent_black_non_hispanic"]]
x <- data[["percent_white_non_hispanic"]]
linear_regression(x, y)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3186 -0.1994 0.0148 0.1140 0.6814
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3186 0.0557 5.72 3.1e-06 ***
## x -0.3334 0.1291 -2.58 0.015 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.253 on 30 degrees of freedom
## Multiple R-squared: 0.182, Adjusted R-squared: 0.155
## F-statistic: 6.67 on 1 and 30 DF, p-value: 0.015
# Regress percent_white_non_hispanic (y) on percent_female (x).
y <- data[["percent_white_non_hispanic"]]
x <- data[["percent_female"]]
linear_regression(x, y)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3760 -0.2712 -0.0566 0.2653 0.7651
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.00937 0.11545 -0.08 0.936
## x 0.52794 0.19952 2.65 0.013 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.323 on 30 degrees of freedom
## Multiple R-squared: 0.189, Adjusted R-squared: 0.162
## F-statistic: 7 on 1 and 30 DF, p-value: 0.0128
# Regress percent_us_citizen (y) on percent_hispanic_latino (x).
y <- data[["percent_us_citizen"]]
x <- data[["percent_hispanic_latino"]]
linear_regression(x, y)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8243 0.0217 0.1375 0.2571 0.3056
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.6944 0.0892 7.79 1.1e-08 ***
## x 0.2599 0.2750 0.95 0.35
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.379 on 30 degrees of freedom
## Multiple R-squared: 0.0289, Adjusted R-squared: -0.00345
## F-statistic: 0.893 on 1 and 30 DF, p-value: 0.352
# Regress percent_permanent_resident_alien (y) on percent_hispanic_latino (x).
y <- data[["percent_permanent_resident_alien"]]
x <- data[["percent_hispanic_latino"]]
linear_regression(x, y)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.1693 -0.0775 -0.0278 -0.0066 0.8218
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0278 0.0427 0.65 0.519
## x 0.3007 0.1316 2.29 0.029 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.181 on 30 degrees of freedom
## Multiple R-squared: 0.148, Adjusted R-squared: 0.12
## F-statistic: 5.23 on 1 and 30 DF, p-value: 0.0295