library(readxl)
# Read the district-level workbook into a tibble, then preview the first rows.
district_data <- readxl::read_excel(
  path = "~/Desktop/grad school/Research II/data selection/district.xls"
)
head(x = district_data, n = 6L)
## # A tibble: 6 × 137
## DISTNAME DISTRICT DZCNTYNM REGION DZRATING DZCAMPUS DPETALLC DPETBLAP DPETHISP
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 CAYUGA … 001902 001 AND… 07 A 3 574 4.4 11.5
## 2 ELKHART… 001903 001 AND… 07 A 4 1150 4 11.8
## 3 FRANKST… 001904 001 AND… 07 A 3 808 8.5 11.3
## 4 NECHES … 001906 001 AND… 07 A 2 342 8.2 13.5
## 5 PALESTI… 001907 001 AND… 07 B 6 3360 25.1 42.9
## 6 WESTWOO… 001908 001 AND… 07 B 4 1332 19.7 26.2
## # ℹ 128 more variables: DPETWHIP <dbl>, DPETINDP <dbl>, DPETASIP <dbl>,
## # DPETPCIP <dbl>, DPETTWOP <dbl>, DPETECOP <dbl>, DPETLEPP <dbl>,
## # DPETSPEP <dbl>, DPETBILP <dbl>, DPETVOCP <dbl>, DPETGIFP <dbl>,
## # DA0AT21R <dbl>, DA0912DR21R <dbl>, DAGC4X21R <dbl>, DAGC5X20R <dbl>,
## # DAGC6X19R <dbl>, DA0GR21N <dbl>, DA0GS21N <dbl>, DDA00A001S22R <dbl>,
## # DDA00A001222R <dbl>, DDA00A001322R <dbl>, DDA00AR01S22R <dbl>,
## # DDA00AR01222R <dbl>, DDA00AR01322R <dbl>, DDA00AM01S22R <dbl>, …
# Define the dependent and independent variables.
# These are not necessarily the variables I would ultimately look at, but they
# are ones that help me conceptualize what is happening in the linear model.
#   DPETALLC: total students
#   DZCAMPUS: number of schools (campuses)
#   DPETBLAP: percentage of African American students
#   DPETHISP: percentage of Hispanic students
#   DPETWHIP: percentage of White students
dependent_variable <- "DPETALLC"
independent_variables <- c("DZCAMPUS", "DPETBLAP", "DPETHISP", "DPETWHIP")

# Build the formula from the vectors above so the variable lists and the
# fitted model cannot drift apart when a predictor is added or removed.
model_formula <- reformulate(
  independent_variables,
  response = dependent_variable
)

# Create the linear model.
# The formula is spliced into the call with bquote() so that summary() still
# prints the full formula in its "Call:" line instead of `model_formula`.
model <- eval(bquote(lm(.(model_formula), data = district_data)))
summary(model)
##
## Call:
## lm(formula = DPETALLC ~ DZCAMPUS + DPETBLAP + DPETHISP + DPETWHIP,
## data = district_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25041 -631 6 571 54253
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11165.06 1633.61 6.835 1.30e-11 ***
## DZCAMPUS 708.28 5.67 124.921 < 2e-16 ***
## DPETBLAP -123.06 18.96 -6.491 1.25e-10 ***
## DPETHISP -125.61 16.58 -7.574 7.19e-14 ***
## DPETWHIP -125.94 17.49 -7.199 1.06e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3177 on 1202 degrees of freedom
## Multiple R-squared: 0.9355, Adjusted R-squared: 0.9353
## F-statistic: 4358 on 4 and 1202 DF, p-value: < 2.2e-16
# Print the fitted coefficients (intercept and one slope per predictor).
coef(model)
## (Intercept) DZCAMPUS DPETBLAP DPETHISP DPETWHIP
## 11165.0577 708.2797 -123.0637 -125.6058 -125.9414
# Test linearity: draw the first lm diagnostic plot (residuals vs. fitted).
plot(model, which = 1L)
# I would say this plot shows that the data/model is not very linear. The
# line deviates from linearity, especially towards the end of the graph.
# I would definitely need to transform the data to help improve the
# linearity.