pop_estimates_all<-rbind(pop_estimates_2019,pop_estimates_2018,pop_estimates_2017,pop_estimates_2016,
pop_estimates_2015,pop_estimates_2014,pop_estimates_2014,pop_estimates_2013,
pop_estimates_2012,pop_estimates_2011,pop_estimates_2010)
clean_all<-rbind(clean_2019,clean_2018,clean_2017,clean_2016,clean_2015,clean_2014,clean_2013,clean_2012,clean_2011,clean_2010)
pop_estimates_all<-pop_estimates_all%>%filter(COUNTY>0)
pop_estimates_all<-pop_estimates_all%>%select("CTYNAME","STNAME","POPESTIMATE","YEAR")
pop_estimates_all<-pop_estimates_all%>%rename(COUNTY=CTYNAME)
pop_estimates_all$COUNTY<-str_remove_all(pop_estimates_all$COUNTY, " County")
clean_all<-clean_all%>%rename("COUNTY"="CNTYNAMB")
clean_all<-clean_all%>%rename("STNAME"="STNAMEBR")
####With such a vast amount of data, I believed that this would work to my advantage when it came to having enough observations when looking at sub regions. This proved to be challenging when it came to visualizing the data and identifying outliers becuase of slight variations across data.
####Finally, I was able to merge the data after much trial and error. Getting to this point took much longer than I had anticipated:
clean_combined<-merge(x=clean_all,y=pop_estimates_all)
summary(clean_combined$DEPSUMBR)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 19447 40103 110814 76365 407675546
clean_combined%>%arrange(desc(DEPSUMBR))%>%head(5)
## STNAME YEAR COUNTY ADDRESBR BKCLASS CITYBR
## 1 New York 2019 New York 390 Madison Avenue N New York
## 2 South Dakota 2019 Minnehaha 101 N. Phillips Avenue N Sioux Falls
## 3 New York 2017 New York 401 Madison Avenue N New York
## 4 New York 2018 New York 401 Madison Avenue N New York
## 5 New York 2016 New York 401 Madison Avenue N New York
## DEPSUM DEPSUMBR METROBR MICROBR NAMEBR
## 1 0 407675546 1 0 Madison Ave And 47th St
## 2 1291135000 353866534 1 0 Wells Fargo Bank, National Association
## 3 0 348521473 1 0 Madison And 48th St Branch
## 4 0 326155491 1 0 Madison And 48th St Branch
## 5 0 313686851 1 0 Madison And 48th St Branch
## NAMEFULL SIMS_LATITUDE SIMS_LONGITUDE ZIPBR
## 1 JPMorgan Chase Bank, National Association 40.75606 -73.97697 10017
## 2 Wells Fargo Bank, National Association 43.54766 -96.72684 57104
## 3 JPMorgan Chase Bank, National Association 40.68580 -73.94230 10017
## 4 JPMorgan Chase Bank, National Association 40.68580 -73.94230 10017
## 5 JPMorgan Chase Bank, National Association 40.68580 -73.94230 10017
## BRSERTYP POPESTIMATE
## 1 11 1628706
## 2 11 193134
## 3 11 1630698
## 4 11 1629055
## 5 11 1635443
clean_combined%>%arrange(DEPSUMBR)%>%head(5)
## STNAME YEAR COUNTY ADDRESBR BKCLASS CITYBR DEPSUM
## 1 Alabama 2010 Autauga 744 East Main Street SM Prattville 0
## 2 Alabama 2010 Baldwin 1301 South Mckenzie Street SM Foley 0
## 3 Alabama 2010 Baldwin 615 Mcmeans Avenue NM Bay Minette 0
## 4 Alabama 2010 Calhoun 900 Wilmer Avenue SM Anniston 0
## 5 Alabama 2010 Clarke 102 Cobb Street NM Grove Hill 0
## DEPSUMBR METROBR MICROBR NAMEBR NAMEFULL
## 1 0 1 0 Prattville Rdt Branch Regions Bank
## 2 0 1 0 Foley Branch Regions Bank
## 3 0 1 0 Bay Minette Branch United Bank
## 4 0 1 0 Anniston Main Rdt Branch Regions Bank
## 5 0 0 0 Cobb Street Branch First United Security Bank
## SIMS_LATITUDE SIMS_LONGITUDE ZIPBR BRSERTYP POPESTIMATE
## 1 32.46007 -86.45584 36067 23 54773
## 2 30.39211 -87.68327 36535 11 183112
## 3 30.88357 -87.78754 36507 11 183112
## 4 33.65574 -85.82848 36201 23 118408
## 5 31.70993 -87.77676 36451 23 25766
##Code that defines the type of service the branch office provides: ###11- Full Service, brick and mortar office ###12- Full Service, retail office ####13- Full Service, cyber office ####21- Limited Service, administrative office ####22- Limited Service, military facility ####23- Limited Service, drive-through facility ####24- Limited Service, loan production office ####25- Limited Service, consumer credit office ####26- Limited Service, contractual office ####27- Limited Service, messenger office ####28- Limited Service, retail office ####29- Limited Service, mobile/seasonal office ####30- Limited Service, trust office"
ggplot(clean_combined,aes(BRSERTYP))+geom_bar()
clean_combined%>%filter(DEPSUMBR==0)%>%count()
## n
## 1 34216
clean_combined<-clean_combined%>%mutate(BKCLASS=as.factor(BKCLASS))
clean_combined_branches<-clean_combined%>%filter(BRSERTYP==c(11,12))%>%filter(DEPSUMBR>0)
## Warning in BRSERTYP == c(11, 12): longer object length is not a multiple of
## shorter object length
ggplot(clean_combined_branches,aes(x="",y=log10(DEPSUMBR)))+geom_boxplot()
outliers<-boxplot(clean_combined_branches$DEPSUMBR)$out
#I will use the function below to remove any of these observations that are considered outliers:
outlier_test1<-clean_combined_branches[-which(clean_combined_branches$DEPSUMBR %in% outliers),]
clean_combined_branches_nooutliers<-outlier_test1
ggplot(outlier_test1,aes(x="",y=DEPSUMBR))+geom_boxplot()
bank_data%>%arrange(desc(DEPSUMBR))%>%head(5)
## STNAME YEAR COUNTY ADDRESBR BKCLASS CITYBR
## 1 Virginia 2014 Caroline 211 North Main Street SM Bowling Green
## 2 California 2019 Los Angeles 3500 East 7th Street N Long Beach
## 3 New Jersey 2015 Bergen 71 Union Avenue N East Rutherford
## 4 Tennessee 2018 Hamilton 1969 Northpoint Boulevard SM Hixson
## 5 California 2014 San Diego 8222 Mira Mesa Boulevard N San Diego
## DEPSUM DEPSUMBR METROBR MICROBR NAMEBR
## 1 0 164036 1 0 Bowling Green Branch
## 2 0 164033 1 0 3500 East 7th Street Branch
## 3 0 164033 1 0 East Rutherford Branch
## 4 0 164029 1 0 Northgate Mall Branch
## 5 0 164028 1 0 Mira Mesa Branch
## NAMEFULL SIMS_LATITUDE SIMS_LONGITUDE ZIPBR
## 1 Union First Market Bank 38.05221 -77.34801 22427
## 2 CIT Bank, National Association 33.77533 -118.15104 90804
## 3 TD Bank, National Association 40.82679 -74.09695 7073
## 4 SunTrust Bank 35.13670 -85.23980 37343
## 5 JPMorgan Chase Bank, National Association 32.91274 -117.14599 92126
## BRSERTYP POPESTIMATE d0_50MM d50_100MM d100_150MM d150_200MM d200_250MM
## 1 11 29741 1 0 0 0 0
## 2 11 10039107 1 0 0 0 0
## 3 11 926391 1 0 0 0 0
## 4 11 364293 1 0 0 0 0
## 5 11 3248877 1 0 0 0 0
## d250_300MM d300_350MM d350_400MM d400_450MM d450_500MM
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
ggplot(bank_data,aes(YEAR,DEPSUMBR))+geom_col()+facet_wrap(~regions)
## Error: At least one layer must contain all faceting variables: `regions`.
## * Plot is missing `regions`
## * Layer 1 is missing `regions`
ggplot(bank_data1,aes(YEAR,DEPSUMBR))+geom_col()+facet_wrap(~regions)
## Error: At least one layer must contain all faceting variables: `regions`.
## * Plot is missing `regions`
## * Layer 1 is missing `regions`
ggplot(bank_data1,aes(YEAR,DEPSUMBR))+geom_col()+facet_wrap(~MICROBR)+ggtitle("sum of deposits in population area of greater than 10k but less than 50K")
ggplot(bank_data1,aes(YEAR,DEPSUMBR))+geom_col()+facet_wrap(~METROBR)+ggtitle("sum of deposits in population area of 50K or more")
summary(bank_data$POPESTIMATE)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 451 75790 309323 843334 904962 10105708
0-4999 5000-19,999 20,000-99,999 100,000-499,999 500,000-999,999 1,000,000+
pop_0_5_data<-clean_combined_branches_popsegments%>%filter(pop_0_5==1)
## Error in eval(lhs, parent, parent): object 'clean_combined_branches_popsegments' not found
pop_5_20_data<-clean_combined_branches_popsegments%>%filter(pop_5_20==1)
## Error in eval(lhs, parent, parent): object 'clean_combined_branches_popsegments' not found
pop_20_100_data<-clean_combined_branches_popsegments%>%filter(pop_20_100==1)
## Error in eval(lhs, parent, parent): object 'clean_combined_branches_popsegments' not found
pop_100_500_data<-clean_combined_branches_popsegments%>%filter(pop_100_500==1)
## Error in eval(lhs, parent, parent): object 'clean_combined_branches_popsegments' not found
pop_500_1000_data<-clean_combined_branches_popsegments%>%filter(pop_500_1000==1)
## Error in eval(lhs, parent, parent): object 'clean_combined_branches_popsegments' not found
pop_1000_up_data<-clean_combined_branches_popsegments%>%filter(pop_1000_up==1)
## Error in eval(lhs, parent, parent): object 'clean_combined_branches_popsegments' not found
summary(lm(DEPSUMBR~POPESTIMATE,data=cleaned_combined_branches_segmented))
##
## Call:
## lm(formula = DEPSUMBR ~ POPESTIMATE, data = cleaned_combined_branches_segmented)
##
## Residuals:
## Min 1Q Median 3Q Max
## -299088 -85350 -66125 -33584 407546429
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.644e+04 3.507e+03 27.50 <2e-16 ***
## POPESTIMATE 2.006e-02 1.839e-03 10.91 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2109000 on 472371 degrees of freedom
## Multiple R-squared: 0.0002518, Adjusted R-squared: 0.0002497
## F-statistic: 119 on 1 and 472371 DF, p-value: < 2.2e-16
summary(lm(DEPSUMBR~POPESTIMATE, data=bank_data1))
##
## Call:
## lm(formula = DEPSUMBR ~ POPESTIMATE, data = bank_data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -83721 -26909 -8944 18742 118479
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.504e+04 2.062e+02 218.4 <2e-16 ***
## POPESTIMATE 3.839e-03 1.177e-04 32.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35950 on 39468 degrees of freedom
## Multiple R-squared: 0.02623, Adjusted R-squared: 0.0262
## F-statistic: 1063 on 1 and 39468 DF, p-value: < 2.2e-16
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.