#library(tidyverse)
library(ggplot2)
library(ggmap)
library(readxl)
library(zipcode)
library(datasets)

load the data

filepath <- "C:/Users/teeja/Desktop/CleanGithubProjects/data_science_with_R/week7/MedianZIP_2_2.xlsx"
  
df <- read_excel(filepath, skip = 1)
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in C7058 / R7058C3: got '.'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in C26134 / R26134C3: got '.'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in C26135 / R26135C3: got '.'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in C26136 / R26136C3: got '.'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in C26203 / R26203C3: got '.'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in C29647 / R29647C3: got '.'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in C29982 / R29982C3: got '.'
head(df)
## # A tibble: 6 x 4
##     Zip Median  Mean   Pop
##   <dbl>  <dbl> <dbl> <dbl>
## 1  1001  56663 66688 16445
## 2  1002  49853 75063 28069
## 3  1003  28462 35121  8491
## 4  1005  75423 82442  4798
## 5  1007  79076 85802 12962
## 6  1008  63980 78391  1244

summary of the data

summary (df)
##       Zip            Median               Mean               Pop        
##  Min.   : 1001   Min.   :    32.98   Min.   :    53.6   Min.   :     1  
##  1st Qu.:27301   1st Qu.: 38462.00   1st Qu.: 48593.2   1st Qu.:   736  
##  Median :49875   Median : 46503.32   Median : 56949.6   Median :  2756  
##  Mean   :49875   Mean   : 50938.21   Mean   : 63452.2   Mean   :  9193  
##  3rd Qu.:72134   3rd Qu.: 58255.50   3rd Qu.: 70341.2   3rd Qu.: 12513  
##  Max.   :99929   Max.   :223106.17   Max.   :361842.3   Max.   :113916  
##                                      NA's   :7

update the columns {zip, median, mean, population}

colnames(df) <- c("zip","median", "mean", "population")
summary(df)
##       zip            median               mean            population    
##  Min.   : 1001   Min.   :    32.98   Min.   :    53.6   Min.   :     1  
##  1st Qu.:27301   1st Qu.: 38462.00   1st Qu.: 48593.2   1st Qu.:   736  
##  Median :49875   Median : 46503.32   Median : 56949.6   Median :  2756  
##  Mean   :49875   Mean   : 50938.21   Mean   : 63452.2   Mean   :  9193  
##  3rd Qu.:72134   3rd Qu.: 58255.50   3rd Qu.: 70341.2   3rd Qu.: 12513  
##  Max.   :99929   Max.   :223106.17   Max.   :361842.3   Max.   :113916  
##                                      NA's   :7

Themean has 7 NA’s.

remove all the NA’s from the dataframe

#function to remove na
remove_na <- function(df, n=0){
  df[rowSums(is.na(df)) <= n,]
}

df <- remove_na(df)

summary(df)
##       zip            median               mean            population    
##  Min.   : 1001   Min.   :    32.98   Min.   :    53.6   Min.   :     1  
##  1st Qu.:27300   1st Qu.: 38462.94   1st Qu.: 48593.2   1st Qu.:   736  
##  Median :49872   Median : 46502.73   Median : 56949.6   Median :  2757  
##  Mean   :49870   Mean   : 50938.83   Mean   : 63452.2   Mean   :  9194  
##  3rd Qu.:72129   3rd Qu.: 58256.00   3rd Qu.: 70341.2   3rd Qu.: 12516  
##  Max.   :99929   Max.   :223106.17   Max.   :361842.3   Max.   :113916
data("zipcode")

dfzip <- data.frame(zipcode)

head(dfzip)
##     zip       city state latitude longitude
## 1 00210 Portsmouth    NH  43.0059  -71.0132
## 2 00211 Portsmouth    NH  43.0059  -71.0132
## 3 00212 Portsmouth    NH  43.0059  -71.0132
## 4 00213 Portsmouth    NH  43.0059  -71.0132
## 5 00214 Portsmouth    NH  43.0059  -71.0132
## 6 00215 Portsmouth    NH  43.0059  -71.0132

standardized the zipcode in df

df$zip <- zipcode::clean.zipcodes(df$zip)
head(df)
## # A tibble: 6 x 4
##   zip   median  mean population
##   <chr>  <dbl> <dbl>      <dbl>
## 1 01001  56663 66688      16445
## 2 01002  49853 75063      28069
## 3 01003  28462 35121       8491
## 4 01005  75423 82442       4798
## 5 01007  79076 85802      12962
## 6 01008  63980 78391       1244

Merge the two dataframe together

df_merged <- merge(df, dfzip)

head(df_merged)
##     zip   median     mean population        city state latitude longitude
## 1 01001 56662.57 66687.75      16445      Agawam    MA 42.07061 -72.62029
## 2 01002 49853.42 75062.63      28069     Amherst    MA 42.37765 -72.50323
## 3 01003 28462.00 35121.00       8491     Amherst    MA 42.36956 -72.63599
## 4 01005 75423.00 82442.00       4798       Barre    MA 42.41209 -72.10443
## 5 01007 79076.35 85801.98      12962 Belchertown    MA 42.27842 -72.41100
## 6 01008 63980.00 78391.00       1244   Blandford    MA 42.17431 -72.94828

number of rows in df_merged

nrow(df_merged)
## [1] 32627

check unique state

length(unique(df_merged$state))
## [1] 51

create a dataframe with full state name and state abbreviation

dfstate <- data.frame(state.abb,state.name)
colnames(dfstate) <- c('state','statename')
head(dfstate)
##   state  statename
## 1    AL    Alabama
## 2    AK     Alaska
## 3    AZ    Arizona
## 4    AR   Arkansas
## 5    CA California
## 6    CO   Colorado
length(unique(dfstate$state))
## [1] 50

remove alaska and hawaii

df_merged <- df_merged[which(!(df_merged$state == 'HI' | df_merged$state == 'AK')),]

length(unique(df_merged$state))
## [1] 49
dfstate <- dfstate[which(!(dfstate$state == 'HI' | dfstate$state == 'AK')),]

length(unique(dfstate$state))
## [1] 48

list the unique state in df_merged

unique(df_merged$state)
##  [1] "MA" "RI" "NH" "ME" "VT" "CT" "NY" "NJ" "PA" "DE" "DC" "VA" "MD" "WV"
## [15] "NC" "SC" "GA" "TN" "FL" "AL" "AR" "KY" "MS" "OH" "IN" "MI" "IA" "IL"
## [29] "WI" "MN" "SD" "ND" "MT" "MO" "KS" "NE" "LA" "OK" "TX" "NM" "CO" "WY"
## [43] "ID" "UT" "AZ" "NV" "CA" "OR" "WA"

list the unique state in dfState

unique(dfstate$state)
##  [1] AL AZ AR CA CO CT DE FL GA ID IL IN IA KS KY LA ME MD MA MI MN MS MO
## [24] MT NE NV NH NJ NM NY NC ND OH OK OR PA RI SC SD TN TX UT VT VA WA WV
## [47] WI WY
## 50 Levels: AK AL AR AZ CA CO CT DE FL GA HI IA ID IL IN KS KY LA MA ... WY

check the difference in the state list as one is 49, and the other is 48

setdiff(df_merged$state, dfstate$state)
## [1] "DC"

romove dc from df merged

df_merged <- df_merged[which(!(df_merged$state == 'DC')),]

length(unique(df_merged$state))
## [1] 48

Merge the state dataframe

df_clean <- merge(dfstate, df_merged)
head(df_clean)
##   state statename   zip   median     mean population       city latitude
## 1    AL   Alabama 36071 34712.39 45707.75        766   Rutledge 31.70884
## 2    AL   Alabama 36075 41282.95 50787.74       2226    Shorter 32.39999
## 3    AL   Alabama 36078 41203.15 57269.85      13179  Tallassee 32.55304
## 4    AL   Alabama 35004 55416.04 60783.88      10427      Moody 33.60638
## 5    AL   Alabama 35005 48546.78 54406.99       7942 Adamsville 33.59258
## 6    AL   Alabama 35006 59241.18 65892.12       3121      Adger 33.45171
##   longitude
## 1 -86.38617
## 2 -85.92370
## 3 -85.91360
## 4 -86.50249
## 5 -86.95969
## 6 -87.23957

check the length of the unique state

length(unique(df_clean$state))
## [1] 48

Convert state to lower case . ggmap works with lowercase

df_clean$statename <- tolower(df_clean$statename)
df_clean$city <- tolower(df_clean$city)
head(df_clean)
##   state statename   zip   median     mean population       city latitude
## 1    AL   alabama 36071 34712.39 45707.75        766   rutledge 31.70884
## 2    AL   alabama 36075 41282.95 50787.74       2226    shorter 32.39999
## 3    AL   alabama 36078 41203.15 57269.85      13179  tallassee 32.55304
## 4    AL   alabama 35004 55416.04 60783.88      10427      moody 33.60638
## 5    AL   alabama 35005 48546.78 54406.99       7942 adamsville 33.59258
## 6    AL   alabama 35006 59241.18 65892.12       3121      adger 33.45171
##   longitude
## 1 -86.38617
## 2 -85.92370
## 3 -85.91360
## 4 -86.50249
## 5 -86.95969
## 6 -87.23957

View the summary statistics of the dataframe

summary(df_clean)
##      state        statename             zip                median         
##  TX     : 1904   Length:32284       Length:32284       Min.   :    32.98  
##  PA     : 1790   Class :character   Class :character   1st Qu.: 38447.86  
##  NY     : 1755   Mode  :character   Mode  :character   Median : 46475.90  
##  CA     : 1744                                         Mean   : 50898.64  
##  IL     : 1383                                         3rd Qu.: 58126.12  
##  OH     : 1192                                         Max.   :223106.17  
##  (Other):22516                                                            
##       mean            population         city              latitude    
##  Min.   :    53.6   Min.   :     1   Length:32284       Min.   :24.57  
##  1st Qu.: 48559.7   1st Qu.:   748   Class :character   1st Qu.:35.47  
##  Median : 56907.9   Median :  2788   Mode  :character   Median :39.51  
##  Mean   : 63407.1   Mean   :  9211                      Mean   :38.81  
##  3rd Qu.: 70258.1   3rd Qu.: 12554                      3rd Qu.:42.08  
##  Max.   :361842.3   Max.   :113916                      Max.   :49.34  
##                                                                        
##    longitude      
##  Min.   :-124.64  
##  1st Qu.: -97.02  
##  Median : -88.15  
##  Mean   : -90.49  
##  3rd Qu.: -80.30  
##  Max.   : -67.00  
## 

show income and population per state

library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
df_income <- sqldf('select statename as state, AVG(median) as medincome, sum(population) as sumpop from df_clean group by state')

head(df_income)
##         state medincome   sumpop
## 1     alabama  40549.90  4770242
## 2    arkansas  36960.95  2936699
## 3     arizona  48132.06  6360679
## 4  california  62625.91 36926889
## 5    colorado  56303.02  4979279
## 6 connecticut  78520.16  3548308
summary(df_income)
##     state             medincome         sumpop        
##  Length:48          Min.   :34210   Min.   :  555216  
##  Class :character   1st Qu.:43847   1st Qu.: 1842437  
##  Mode  :character   Median :49054   Median : 4510296  
##                     Mean   :51330   Mean   : 6195112  
##                     3rd Qu.:55537   3rd Qu.: 6956398  
##                     Max.   :82208   Max.   :36926889

Map US using map_data

us <- map_data('state')
head(us)
##        long      lat group order  region subregion
## 1 -87.46201 30.38968     1     1 alabama      <NA>
## 2 -87.48493 30.37249     1     2 alabama      <NA>
## 3 -87.52503 30.37249     1     3 alabama      <NA>
## 4 -87.53076 30.33239     1     4 alabama      <NA>
## 5 -87.57087 30.32665     1     5 alabama      <NA>
## 6 -87.58806 30.32665     1     6 alabama      <NA>

MAP

Median income by state

g <- ggplot(df_income, aes(map_id = state)) 
g <- g + geom_map(map=us, aes(fill=df_income$medincome))
g <- g + expand_limits(x=us$long, y=us$lat) + coord_map()
g <- g + ggtitle('Median Income by State') + theme_light()
g <- g + scale_fill_gradient(low = 'lightblue', high = 'darkblue', name ='Median Income')
g <- g + xlab('Longitude') + ylab('Latitude')

g

MAP: Population by state

g <- ggplot(df_income, aes(map_id = state)) + geom_map(map=us,aes(fill=df_income$sumpop))
g <- g +  expand_limits(x=us$long, y=us$lat) + coord_map()

g <- g + ggtitle('Population by State') + theme_light()
g <- g + scale_fill_gradient(low = 'lightblue', high = 'darkblue', name ='Median Income')
g <- g + xlab('Longitude') + ylab('Latitude')
g

Income per zipcode

g <- ggplot(df_clean,aes(map_id = statename))
g <- g + geom_map(map=us)
g <- g + expand_limits(x=us$long, y=us$lat) + coord_map()
g <- g + geom_point(aes(x=df_clean$longitude, y=df_clean$latitude, color= median))  + theme_light()
g <- g + ggtitle('Median income by zip code') 
g <- g + xlab('Longitude') + ylab('Latitude')
#g <- g + scale_fill_gradientn(colours = rainbow(4), name ='Median Income by Zip')
g

Zip code density

g <- ggplot(df_clean,aes(map_id = statename)) + geom_map(map=us, fill = 'white', color ='black')
g <- g + expand_limits(x=us$long, y=us$lat) + coord_map()
g <- g + stat_density2d(data = df_clean, aes(x= longitude, y= latitude,fill =..level..,alpha= ..level.. ),size=2,bins=5, geom = "polygon")
g <- g + theme_classic() + ggtitle('Zipcode density map of US')
g <- g + xlab('Longitude') + ylab('Latitude')
g

Zoom to NY

#New York latlon
#latlon <- geocode("New York",output = "latlona", source = 'dsk')


g <- ggplot(df_clean,aes(map_id = statename))
g <- g + geom_map(map=us)
#g <- g + expand_limits(x=latlon$lon, y=latlon$lat) + coord_map()
g <- g +  xlim(-80,-67) + ylim(37.5,47.5) + coord_map()
g <- g + geom_point(aes(x=df_clean$longitude, y=df_clean$latitude, color= median))  + theme_light()
g <- g + ggtitle('Median income by zip code') 
g <- g + xlab('Longitude') + ylab('Latitude')
#g <- g + scale_fill_gradientn(colours = rainbow(4), name ='Median Income by Zip,NY')
g
## Warning: Removed 25448 rows containing missing values (geom_point).

zoom to NY for the density map

g <- ggplot(df_clean,aes(map_id = statename)) + geom_map(map=us, fill = 'white', color ='black')
#g <- g + expand_limits(x=us$long, y=us$lat) + coord_map()
g <- g +  xlim(-80,-67) + ylim(37.5,47.5) + coord_map()
g <- g + stat_density2d(data = df_clean, aes(x= longitude, y= latitude,fill =..level..,alpha= ..level.. ),size=2,bins=5, geom = "polygon")
g <- g + theme_classic() + ggtitle('Zipcode density map of NY')
g <- g + xlab('Longitude') + ylab('Latitude')
g
## Warning: Removed 25448 rows containing non-finite values (stat_density2d).