General Assembly Data Science Class, Assignment 2 - Linear Regression

Brian Abelson

2012-10-18

Setup

# Load the raw district data, normalise the column names, derive a numeric
# school-district id and a numeric borough code, and save the result.

# NOTE(review): hard-coded, machine-specific working directory. Kept so the
# relative file names below resolve exactly as before; consider running the
# script from the project directory instead.
setwd("/Users/brian/Dropbox/GitRepository/data_science_class/class/hw2")
data <- read.csv("districts.csv", stringsAsFactors = FALSE)

# munge headers: lower-case them and replace dots with underscores
names(data) <- tolower(names(data))
names(data) <- gsub("\\.", "_", names(data))

# munge school district
# jurisdiction_name is parsed as "CSD <number> <borough>". The pattern is
# anchored and captures the whole remainder as the borough, so a multi-word
# borough (such as "Staten Island") no longer leaves a trailing word glued to
# the district number, which as.numeric() would turn into NA.
data$schooldist <- sub("^CSD ([0-9]+) (.+)$", "\\1", data$jurisdiction_name)
data$schooldist <- as.numeric(data$schooldist)

# extract boro and store it as the integer code of its factor level
data$boro <- sub("^CSD ([0-9]+) (.+)$", "\\2", data$jurisdiction_name)
data$boro <- as.numeric(as.factor(data$boro))

# write munged data to file
write.csv(data, "districts_munged.csv", row.names = FALSE)

Create a function that runs a bivariate regression, draws a scatter plot with the line of best fit, and reports the R-squared and adjusted R-squared values

# Fit the simple linear model y ~ x, draw the scatter plot of y against x with
# the fitted line on top, and return the model summary.
#
# x: predictor values
# y: response values
# Returns the summary() of the fitted lm object (printed when called at top
# level), which includes R-squared and adjusted R-squared.
linear_regression <- function(x, y) {
    fit <- lm(y ~ x)
    fit_summary <- summary(fit)

    # scatter of the raw points, then the regression line over it
    plot(x, y, pch = 20, col = "darkblue")
    abline(fit, lwd = 2, col = "darkred")

    fit_summary
}

Run the function on six pairs of variables

# response: percent_black_non_hispanic; predictor: percent_male
y <- data[["percent_black_non_hispanic"]]
x <- data[["percent_male"]]
linear_regression(x, y)

plot of chunk unnamed-chunk-3

## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3870 -0.1484 -0.0577  0.1316  0.5564 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   0.0577     0.0773    0.75   0.4609   
## x             0.5144     0.1860    2.77   0.0096 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.25 on 30 degrees of freedom
## Multiple R-squared: 0.203,   Adjusted R-squared: 0.177 
## F-statistic: 7.65 on 1 and 30 DF,  p-value: 0.00962
# response: percent_black_non_hispanic; predictor: percent_asian_non_hispanic
y <- data[["percent_black_non_hispanic"]]
x <- data[["percent_asian_non_hispanic"]]
linear_regression(x, y)

plot of chunk unnamed-chunk-4

## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -0.252 -0.233 -0.124  0.177  0.748 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.2524     0.0529    4.77  4.4e-05 ***
## x            -0.2396     0.2530   -0.95     0.35    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.276 on 30 degrees of freedom
## Multiple R-squared: 0.029,   Adjusted R-squared: -0.00332 
## F-statistic: 0.897 on 1 and 30 DF,  p-value: 0.351
# response: percent_black_non_hispanic; predictor: percent_white_non_hispanic
y <- data[["percent_black_non_hispanic"]]
x <- data[["percent_white_non_hispanic"]]
linear_regression(x, y)

plot of chunk unnamed-chunk-5

## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3186 -0.1994  0.0148  0.1140  0.6814 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.3186     0.0557    5.72  3.1e-06 ***
## x            -0.3334     0.1291   -2.58    0.015 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.253 on 30 degrees of freedom
## Multiple R-squared: 0.182,   Adjusted R-squared: 0.155 
## F-statistic: 6.67 on 1 and 30 DF,  p-value: 0.015
# response: percent_white_non_hispanic; predictor: percent_female
y <- data[["percent_white_non_hispanic"]]
x <- data[["percent_female"]]
linear_regression(x, y)

plot of chunk unnamed-chunk-6

## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3760 -0.2712 -0.0566  0.2653  0.7651 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -0.00937    0.11545   -0.08    0.936  
## x            0.52794    0.19952    2.65    0.013 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.323 on 30 degrees of freedom
## Multiple R-squared: 0.189,   Adjusted R-squared: 0.162 
## F-statistic:    7 on 1 and 30 DF,  p-value: 0.0128
# response: percent_us_citizen; predictor: percent_hispanic_latino
y <- data[["percent_us_citizen"]]
x <- data[["percent_hispanic_latino"]]
linear_regression(x, y)

plot of chunk unnamed-chunk-7

## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8243  0.0217  0.1375  0.2571  0.3056 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.6944     0.0892    7.79  1.1e-08 ***
## x             0.2599     0.2750    0.95     0.35    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.379 on 30 degrees of freedom
## Multiple R-squared: 0.0289,  Adjusted R-squared: -0.00345 
## F-statistic: 0.893 on 1 and 30 DF,  p-value: 0.352
# response: percent_permanent_resident_alien; predictor: percent_hispanic_latino
y <- data[["percent_permanent_resident_alien"]]
x <- data[["percent_hispanic_latino"]]
linear_regression(x, y)

plot of chunk unnamed-chunk-8

## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.1693 -0.0775 -0.0278 -0.0066  0.8218 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   0.0278     0.0427    0.65    0.519  
## x             0.3007     0.1316    2.29    0.029 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.181 on 30 degrees of freedom
## Multiple R-squared: 0.148,   Adjusted R-squared: 0.12 
## F-statistic: 5.23 on 1 and 30 DF,  p-value: 0.0295