This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(zipcode)
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.8
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts -------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(viridis)
## Loading required package: viridisLite
library(ggthemes)
library(readxl)
library(ggplot2)
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following object is masked from 'package:maps':
##
## ozone
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
# Import CT school and real estate price data from the local Excel workbook.
ct_workbook_path <- "C:/Users/yul13011/Desktop/Data Incubator Challenge/Challenge/CT school realestate.xls"
CT_data <- read_excel(ct_workbook_path)
## readxl works best with a newer version of the tibble package.
## You currently have tibble v1.4.2.
## Falling back to column name repair from tibble <= v1.4.2.
## Message displays once per session.
# Build the `zip` key column by prefixing every lzip value with "0".
# NOTE(review): presumably lzip lost its leading zero on import - confirm.
zip_key <- paste0("0", CT_data$lzip)
CT_data <- cbind(CT_data, zip = zip_key)

# One data frame per school level code.
school_level <- CT_data$level
CT_primary <- CT_data[school_level == 1, ]  # Primary school
CT_middle <- CT_data[school_level == 2, ]   # Middle school
CT_high <- CT_data[school_level == 3, ]     # High school
### Primary school analysis ###

# Mean math proficiency per zip code (missing scores are ignored).
# dplyr:: is spelled out because plyr is attached after dplyr and masks
# summarise(); the plyr version does not honour group_by().
# The result is no longer stored in a variable called `sum`, which shadowed
# base::sum().
primary_math_by_zip <- CT_primary %>%
  group_by(zip) %>%
  dplyr::summarise(math = mean(math_prof_1617, na.rm = TRUE))

# Drop columns 3 and 6 by position, then de-duplicate the remaining rows.
# NOTE(review): presumably these are the per-school columns, removed so that
# unique() leaves one row per zip - confirm against the workbook layout.
CT_primary1 <- CT_primary[-c(3, 6)]
CT_primary2 <- unique(CT_primary1)

# Attach the zip-level attributes to the per-zip math averages.
CT_primary3 <- left_join(primary_math_by_zip, CT_primary2, by = "zip")
## Joining, by = "zip"
# Load the `zipcode` dataset and keep only the rows whose state is "CT".
data(zipcode)
in_connecticut <- zipcode$state == "CT"
zipcode <- zipcode[in_connecticut, ]
# Join school data with zipcode (left join: every school-data row is kept).
CT_primary_zip <- plyr::join(CT_primary3, zipcode, by = "zip", type = "left")

# Keep only zips whose mvpsqft is below the cap.
# which() drops rows where mvpsqft is NA; indexing with the bare logical
# vector would instead return an all-NA row for each of them.
mvpsqft_cap <- 400
CT_primary_zip2 <- CT_primary_zip[which(CT_primary_zip$mvpsqft < mvpsqft_cap), ]
# Scatter plot of zip-level primary-school math proficiency (`math`) against
# median home value per square foot (`mvpsqft`), with a linear trend line.
# Columns are named directly inside aes(): `df$col` bypasses ggplot2's data
# masking and breaks as soon as the layer data differs from the global object.
# The misspelled labels ("Primiary", "Pecent") are corrected.
ggplot(CT_primary_zip2, aes(x = mvpsqft, y = math, color = mvpsqft)) +
  geom_point(shape = 16, size = 3, show.legend = FALSE) +
  theme_minimal() +
  geom_smooth(method = lm) +
  ggtitle("Primary School Math Achievement and Median Home Value per\nSqft, 2016-17, CT") +
  xlab("Median Home Value per\nSqft") +
  ylab("Percent Student Proficient on Math State\nAssessments") +
  theme(legend.position = "none")
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
# Fit a linear regression of zip-level math proficiency on mvpsqft and print
# the model summary.
fit1 <- lm(CT_primary_zip2$math ~ CT_primary_zip2$mvpsqft)
summary(fit1)
##
## Call:
## lm(formula = CT_primary_zip2$math ~ CT_primary_zip2$mvpsqft)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.561 -10.829 0.161 8.907 35.189
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.01068 4.56570 -0.659 0.511
## CT_primary_zip2$mvpsqft 0.37613 0.03175 11.846 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.73 on 126 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.5269, Adjusted R-squared: 0.5232
## F-statistic: 140.3 on 1 and 126 DF, p-value: < 2.2e-16
# Predicted math proficiency for every row of CT_primary_zip2.
# newdata is the frame itself rather than a one-column copy: a copy built with
# data.frame(CT_primary_zip2$mvpsqft) gets an auto-generated column name that
# matches no term of the model, so it would be silently ignored.
CT_primary_zip3 <- cbind(
  CT_primary_zip2,
  predicted = predict(fit1, newdata = CT_primary_zip2)
)

# Observed minus predicted proficiency; plotted later as "value-added".
value <- CT_primary_zip3$math - CT_primary_zip3$predicted
CT_primary_zip4 <- cbind(CT_primary_zip3, value)
# Load map data and keep the Connecticut outline.
us <- map_data("state")
CT <- us[us$region == "connecticut", ]

# Plot each zip at its longitude/latitude over the state outline, colored by
# `value` (observed minus predicted math proficiency).
# The color aesthetic names the column directly instead of using
# `CT_primary_zip4$value`, so it is resolved from the layer's own data.
ggplot(CT_primary_zip4, aes(x = longitude, y = latitude)) +
  geom_polygon(
    data = CT,
    aes(x = long, y = lat, group = group),
    color = "gray", fill = NA, alpha = .5
  ) +
  geom_point(aes(color = value), size = 3, alpha = .5) +
  labs(color = "Primary School Value-Added") +
  ggtitle("CT Primary School Math Value-Added")
#write.csv(CT_primary_zip4,file="C:/Users/yul13011/Desktop/Data Incubator Challenge/Challenge/CT Primiary.csv")
You can also embed plots, for example:
## Joining, by = "zip"
##
## Call:
## lm(formula = CT_middle_zip2$math ~ CT_middle_zip2$mvpsqft)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.102 -9.022 1.054 10.039 23.501
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.62579 5.78138 0.454 0.651
## CT_middle_zip2$mvpsqft 0.28908 0.03872 7.466 1.63e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.12 on 71 degrees of freedom
## Multiple R-squared: 0.4398, Adjusted R-squared: 0.4319
## F-statistic: 55.74 on 1 and 71 DF, p-value: 1.63e-10
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
### High School Analysis ###

# Mean math proficiency per zip code (missing scores are ignored).
# dplyr:: is spelled out because plyr is attached after dplyr and masks
# summarise(); the plyr version does not honour group_by().
sum3 <- CT_high %>%
  group_by(zip) %>%
  dplyr::summarise(math = mean(math_prof_1617, na.rm = TRUE))

# Drop columns 3 and 6 by position, then de-duplicate the remaining rows.
# NOTE(review): presumably these are the per-school columns, removed so that
# unique() leaves one row per zip - confirm against the workbook layout.
CT_high1 <- CT_high[-c(3, 6)]
CT_high2 <- unique(CT_high1)

# Attach the zip-level attributes to the per-zip math averages.
CT_high3 <- left_join(sum3, CT_high2, by = "zip")
## Joining, by = "zip"
# Join high-school data with zipcode (left join: every school-data row is kept).
CT_high_zip <- plyr::join(CT_high3, zipcode, by = "zip", type = "left")

# Keep only zips whose mvpsqft is below the cap.
# which() drops rows where mvpsqft is NA; indexing with the bare logical
# vector would instead return an all-NA row for each of them.
mvpsqft_cap <- 400
CT_high_zip2 <- CT_high_zip[which(CT_high_zip$mvpsqft < mvpsqft_cap), ]
# Scatter plot of zip-level high-school math proficiency (`math`) against
# median home value per square foot (`mvpsqft`), with a linear trend line.
# Columns are named directly inside aes(): `df$col` bypasses ggplot2's data
# masking and breaks as soon as the layer data differs from the global object.
# The misspelled axis label ("Pecent") is corrected.
ggplot(CT_high_zip2, aes(x = mvpsqft, y = math, color = mvpsqft)) +
  geom_point(shape = 16, size = 3, show.legend = FALSE) +
  theme_minimal() +
  geom_smooth(method = lm) +
  ggtitle("High School Math Achievement and Median Home Value per\nSqft, 2016-17, CT") +
  xlab("Median Home Value per\nSqft") +
  ylab("Percent Student Proficient on Math State\nAssessments") +
  theme(legend.position = "none")
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).
# Fit a linear regression of zip-level math proficiency on mvpsqft and print
# the model summary.
fit3 <- lm(CT_high_zip2$math ~ CT_high_zip2$mvpsqft)
summary(fit3)
##
## Call:
## lm(formula = CT_high_zip2$math ~ CT_high_zip2$mvpsqft)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.055 -12.539 -1.315 12.582 29.726
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.21600 6.51857 -0.954 0.343
## CT_high_zip2$mvpsqft 0.35032 0.04497 7.789 3.79e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.28 on 72 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.4573, Adjusted R-squared: 0.4498
## F-statistic: 60.67 on 1 and 72 DF, p-value: 3.791e-11
# Predicted math proficiency for every row of CT_high_zip2.
# newdata is the frame itself rather than a one-column copy: a copy built with
# data.frame(CT_high_zip2$mvpsqft) gets an auto-generated column name that
# matches no term of the model, so it would be silently ignored.
CT_high_zip3 <- cbind(
  CT_high_zip2,
  predicted = predict(fit3, newdata = CT_high_zip2)
)

# Observed minus predicted proficiency; plotted later as "value-added".
value3 <- CT_high_zip3$math - CT_high_zip3$predicted
CT_high_zip4 <- cbind(CT_high_zip3, value3)
# Plot each zip at its longitude/latitude over the Connecticut outline (`CT`),
# colored by `value3` (observed minus predicted math proficiency).
# The column is referenced by its real name: `CT_high_zip4$value` only worked
# through `$` partial matching on a data.frame and would be NULL on a tibble.
ggplot(CT_high_zip4, aes(x = longitude, y = latitude)) +
  geom_polygon(
    data = CT,
    aes(x = long, y = lat, group = group),
    color = "gray", fill = NA, alpha = .5
  ) +
  geom_point(aes(color = value3), size = 3, alpha = .5) +
  labs(color = "High School Value-Added") +
  scale_color_gradient(low = "black", high = "red") +
  ggtitle("CT High School Math Value-Added")
#write.csv(CT_high_zip4,file="C:/Users/yul13011/Desktop/Data Incubator Challenge/Challenge/CT high.csv")