R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(zipcode)
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0     v purrr   0.2.5
## v tibble  1.4.2     v dplyr   0.7.8
## v tidyr   0.8.1     v stringr 1.3.1
## v readr   1.1.1     v forcats 0.3.0
## -- Conflicts -------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
library(viridis)
## Loading required package: viridisLite
library(ggthemes)
library(readxl)
library(ggplot2)
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:maps':
## 
##     ozone
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:purrr':
## 
##     compact
# Read the Connecticut school / real-estate workbook into CT_data.
# NOTE(review): the path is absolute and machine-specific; the script only
# runs where the workbook exists at exactly this location.
CT_data <- readxl::read_excel(
  path = "C:/Users/yul13011/Desktop/Data Incubator Challenge/Challenge/CT school realestate.xls"
)
## readxl works best with a newer version of the tibble package.
## You currently have tibble v1.4.2.
## Falling back to column name repair from tibble <= v1.4.2.
## Message displays once per session.
# Append a `zip` column built by prefixing "0" to `lzip`.
# NOTE(review): presumably `lzip` lost its leading zero when the ZIP codes
# were read as numbers -- confirm against the workbook.
CT_data <- cbind(CT_data, zip = paste0("0", CT_data$lzip))

# One table per school level, selected by the `level` code.
CT_primary <- CT_data[CT_data$level == 1, ] # level 1: primary school
CT_middle <- CT_data[CT_data$level == 2, ] # level 2: middle school
CT_high <- CT_data[CT_data$level == 3, ] # level 3: high school


### Primary school analysis ###

# Mean of `math_prof_1617` per zip code, ignoring NA scores. The result has
# two columns: `zip` and `math`.
# dplyr:: is spelled out because plyr is attached after dplyr and masks
# summarise()/summarize(), so an unqualified call would dispatch to plyr.
# NOTE(review): `sum` shadows the name of base::sum(); it is kept because
# code outside this chunk may refer to it.
sum <- CT_primary %>%
  dplyr::group_by(zip) %>%
  dplyr::summarise(math = mean(math_prof_1617, na.rm = TRUE))

# Drop columns 3 and 6 by position, then collapse duplicate rows.
# NOTE(review): presumably these are the school-level columns, so that each
# zip reduces to a single row -- confirm; positional indexing breaks
# silently if the workbook layout changes.
CT_primary1 <- CT_primary[-c(3, 6)]
CT_primary2 <- unique(CT_primary1)

# Attach the remaining zip-level columns to the per-zip means. The join key
# is inferred from the column names the two tables share.
CT_primary3 <- left_join(sum, CT_primary2)
## Joining, by = "zip"
# Load the `zipcode` dataset and keep only the Connecticut rows.
data(zipcode)
zipcode <- zipcode[zipcode$state == "CT", ]

# Left-join the per-zip school table onto the zip lookup with plyr::join(),
# then keep rows whose `mvpsqft` is below 400.
# NOTE(review): a row whose `mvpsqft` is NA is not dropped by this filter;
# an NA logical index yields an all-NA row instead.
CT_primary_zip <- plyr::join(CT_primary3, zipcode, by = "zip", type = "left")
CT_primary_zip2 <- CT_primary_zip[CT_primary_zip$mvpsqft < 400, ]
#attach(CT_data_zip2)


# Scatter plot of zip-level math proficiency (`math`) against `mvpsqft`,
# with a fitted linear trend line. Points are coloured by `mvpsqft`, and the
# colour legend is suppressed.
# Columns are referenced by bare name inside aes() so they are resolved in
# the plot data. Labels are single clean strings: the originals spanned
# several source lines, which embedded a newline plus indentation padding in
# the rendered text.
ggplot(CT_primary_zip2, aes(x = mvpsqft, y = math, color = mvpsqft)) +
  geom_point(shape = 16, size = 3, show.legend = FALSE) +
  geom_smooth(method = "lm") +
  theme_minimal() +
  # Must come after theme_minimal(), which would otherwise reset it.
  theme(legend.position = "none") +
  ggtitle("Primary School Math Achievement and\nMedian Home Value per Sqft, 2016-17, CT") +
  xlab("Median Home Value per Sqft") +
  ylab("Percent Student Proficient on Math State Assessments")
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

# Simple linear regression of zip-level math proficiency on `mvpsqft`,
# followed by the printed model summary.
# NOTE(review): the formula references the data frame's columns directly
# rather than using `data =`; it is left as written so the printed call and
# coefficient names stay the same.
fit1 <- lm(formula = CT_primary_zip2$math ~ CT_primary_zip2$mvpsqft)
summary(fit1)
## 
## Call:
## lm(formula = CT_primary_zip2$math ~ CT_primary_zip2$mvpsqft)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -32.561 -10.829   0.161   8.907  35.189 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -3.01068    4.56570  -0.659    0.511    
## CT_primary_zip2$mvpsqft  0.37613    0.03175  11.846   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.73 on 126 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.5269, Adjusted R-squared:  0.5232 
## F-statistic: 140.3 on 1 and 126 DF,  p-value: < 2.2e-16
# Add the model's prediction for every row as a `predicted` column.
# NOTE(review): fit1's formula refers to CT_primary_zip2$mvpsqft directly,
# while the single column of this newdata frame is auto-named
# `CT_primary_zip2.mvpsqft`, so it does not match a model variable by name.
# Predictions appear to be evaluated from CT_primary_zip2 itself -- confirm,
# and consider refitting with lm(math ~ mvpsqft, data = ...).
CT_primary_zip3 <- cbind(
  CT_primary_zip2,
  predicted = predict(fit1, newdata = data.frame(CT_primary_zip2$mvpsqft))
)

# Observed minus predicted math proficiency (the residual).
value <- CT_primary_zip3$math - CT_primary_zip3$predicted

# Append the residual as a `value` column.
CT_primary_zip4 <- cbind(CT_primary_zip3, value)

# Load US state outlines and keep the Connecticut polygon(s).
us <- map_data("state")
CT <- us[us$region == "connecticut", ]

# Draw the Connecticut outline and overlay one point per row of
# CT_primary_zip4 at its longitude/latitude, coloured by the `value` column.
# `value` is referenced by bare name so aes() resolves it in the layer's
# data instead of capturing an external vector via `$`.
ggplot(CT_primary_zip4, aes(x = longitude, y = latitude)) +
  geom_polygon(
    data = CT,
    aes(x = long, y = lat, group = group),
    color = "gray", fill = NA, alpha = .5
  ) +
  geom_point(aes(color = value), size = 3, alpha = .5) +
  labs(color = "Primary School Value-Added") +
  ggtitle("CT Primary School Math Value-Added")

#write.csv(CT_primary_zip4,file="C:/Users/yul13011/Desktop/Data Incubator Challenge/Challenge/CT Primiary.csv")

Including Plots

You can also embed plots, for example:

## Joining, by = "zip"

## 
## Call:
## lm(formula = CT_middle_zip2$math ~ CT_middle_zip2$mvpsqft)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.102  -9.022   1.054  10.039  23.501 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             2.62579    5.78138   0.454    0.651    
## CT_middle_zip2$mvpsqft  0.28908    0.03872   7.466 1.63e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.12 on 71 degrees of freedom
## Multiple R-squared:  0.4398, Adjusted R-squared:  0.4319 
## F-statistic: 55.74 on 1 and 71 DF,  p-value: 1.63e-10

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

### High School Analysis ###

# Mean of `math_prof_1617` per zip code, ignoring NA scores. The result has
# two columns: `zip` and `math`.
# dplyr:: is spelled out because plyr is attached after dplyr and masks
# summarise()/summarize(), so an unqualified call would dispatch to plyr.
sum3 <- CT_high %>%
  dplyr::group_by(zip) %>%
  dplyr::summarise(math = mean(math_prof_1617, na.rm = TRUE))

# Drop columns 3 and 6 by position, then collapse duplicate rows.
# NOTE(review): presumably these are the school-level columns, so that each
# zip reduces to a single row -- confirm; positional indexing breaks
# silently if the workbook layout changes.
CT_high1 <- CT_high[-c(3, 6)]
CT_high2 <- unique(CT_high1)

# Attach the remaining zip-level columns to the per-zip means. The join key
# is inferred from the column names the two tables share.
CT_high3 <- left_join(sum3, CT_high2)
## Joining, by = "zip"
# Left-join the per-zip high school table onto `zipcode` with plyr::join(),
# then keep rows whose `mvpsqft` is below 400.
# NOTE(review): a row whose `mvpsqft` is NA is not dropped by this filter;
# an NA logical index yields an all-NA row instead.
CT_high_zip <- plyr::join(CT_high3, zipcode, by = "zip", type = "left")
CT_high_zip2 <- CT_high_zip[CT_high_zip$mvpsqft < 400, ]

# Scatter plot of zip-level math proficiency (`math`) against `mvpsqft`,
# with a fitted linear trend line. Points are coloured by `mvpsqft`, and the
# colour legend is suppressed.
# Columns are referenced by bare name inside aes() so they are resolved in
# the plot data. Labels are single clean strings: the originals spanned
# several source lines, which embedded a newline plus indentation padding in
# the rendered text.
ggplot(CT_high_zip2, aes(x = mvpsqft, y = math, color = mvpsqft)) +
  geom_point(shape = 16, size = 3, show.legend = FALSE) +
  geom_smooth(method = "lm") +
  theme_minimal() +
  # Must come after theme_minimal(), which would otherwise reset it.
  theme(legend.position = "none") +
  ggtitle("High School Math Achievement and\nMedian Home Value per Sqft, 2016-17, CT") +
  xlab("Median Home Value per Sqft") +
  ylab("Percent Student Proficient on Math State Assessments")
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).

# Simple linear regression of zip-level math proficiency on `mvpsqft`,
# followed by the printed model summary.
# NOTE(review): the formula references the data frame's columns directly
# rather than using `data =`; it is left as written so the printed call and
# coefficient names stay the same.
fit3 <- lm(formula = CT_high_zip2$math ~ CT_high_zip2$mvpsqft)
summary(fit3)
## 
## Call:
## lm(formula = CT_high_zip2$math ~ CT_high_zip2$mvpsqft)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.055 -12.539  -1.315  12.582  29.726 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -6.21600    6.51857  -0.954    0.343    
## CT_high_zip2$mvpsqft  0.35032    0.04497   7.789 3.79e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.28 on 72 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.4573, Adjusted R-squared:  0.4498 
## F-statistic: 60.67 on 1 and 72 DF,  p-value: 3.791e-11
# Add the model's prediction for every row as a `predicted` column.
# NOTE(review): fit3's formula refers to CT_high_zip2$mvpsqft directly,
# while the single column of this newdata frame is auto-named
# `CT_high_zip2.mvpsqft`, so it does not match a model variable by name.
# Predictions appear to be evaluated from CT_high_zip2 itself -- confirm,
# and consider refitting with lm(math ~ mvpsqft, data = ...).
CT_high_zip3 <- cbind(
  CT_high_zip2,
  predicted = predict(fit3, newdata = data.frame(CT_high_zip2$mvpsqft))
)

# Observed minus predicted math proficiency (the residual).
value3 <- CT_high_zip3$math - CT_high_zip3$predicted

# Append the residual; cbind() names the new column `value3`.
CT_high_zip4 <- cbind(CT_high_zip3, value3)

# Draw the Connecticut outline (`CT`) and overlay one point per row of
# CT_high_zip4 at its longitude/latitude, coloured by the residual on a
# black-to-red gradient.
# The residual column is named `value3`. The previous `CT_high_zip4$value`
# only resolved through `$` partial matching on a data.frame, which returns
# NULL on a tibble or as soon as another column starts with "value".
# Referencing `value3` by bare name is exact and resolved in the layer data.
ggplot(CT_high_zip4, aes(x = longitude, y = latitude)) +
  geom_polygon(
    data = CT,
    aes(x = long, y = lat, group = group),
    color = "gray", fill = NA, alpha = .5
  ) +
  geom_point(aes(color = value3), size = 3, alpha = .5) +
  labs(color = "High School Value-Added") +
  scale_color_gradient(low = "black", high = "red") +
  ggtitle("CT High School Math Value-Added")

#write.csv(CT_high_zip4,file="C:/Users/yul13011/Desktop/Data Incubator Challenge/Challenge/CT high.csv")