#library(tidyverse)
library(ggplot2)
library(ggmap)
library(readxl)
library(zipcode)
library(datasets)
filepath <- "C:/Users/teeja/Desktop/CleanGithubProjects/data_science_with_R/week7/MedianZIP_2_2.xlsx"
df <- read_excel(filepath, skip = 1)
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in C7058 / R7058C3: got '.'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in C26134 / R26134C3: got '.'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in C26135 / R26135C3: got '.'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in C26136 / R26136C3: got '.'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in C26203 / R26203C3: got '.'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in C29647 / R29647C3: got '.'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in C29982 / R29982C3: got '.'
head(df)
## # A tibble: 6 x 4
## Zip Median Mean Pop
## <dbl> <dbl> <dbl> <dbl>
## 1 1001 56663 66688 16445
## 2 1002 49853 75063 28069
## 3 1003 28462 35121 8491
## 4 1005 75423 82442 4798
## 5 1007 79076 85802 12962
## 6 1008 63980 78391 1244
summary (df)
## Zip Median Mean Pop
## Min. : 1001 Min. : 32.98 Min. : 53.6 Min. : 1
## 1st Qu.:27301 1st Qu.: 38462.00 1st Qu.: 48593.2 1st Qu.: 736
## Median :49875 Median : 46503.32 Median : 56949.6 Median : 2756
## Mean :49875 Mean : 50938.21 Mean : 63452.2 Mean : 9193
## 3rd Qu.:72134 3rd Qu.: 58255.50 3rd Qu.: 70341.2 3rd Qu.: 12513
## Max. :99929 Max. :223106.17 Max. :361842.3 Max. :113916
## NA's :7
colnames(df) <- c("zip","median", "mean", "population")
summary(df)
## zip median mean population
## Min. : 1001 Min. : 32.98 Min. : 53.6 Min. : 1
## 1st Qu.:27301 1st Qu.: 38462.00 1st Qu.: 48593.2 1st Qu.: 736
## Median :49875 Median : 46503.32 Median : 56949.6 Median : 2756
## Mean :49875 Mean : 50938.21 Mean : 63452.2 Mean : 9193
## 3rd Qu.:72134 3rd Qu.: 58255.50 3rd Qu.: 70341.2 3rd Qu.: 12513
## Max. :99929 Max. :223106.17 Max. :361842.3 Max. :113916
## NA's :7
remove all the NA’s from the dataframe
#function to remove na
remove_na <- function(df, n=0){
df[rowSums(is.na(df)) <= n,]
}
df <- remove_na(df)
summary(df)
## zip median mean population
## Min. : 1001 Min. : 32.98 Min. : 53.6 Min. : 1
## 1st Qu.:27300 1st Qu.: 38462.94 1st Qu.: 48593.2 1st Qu.: 736
## Median :49872 Median : 46502.73 Median : 56949.6 Median : 2757
## Mean :49870 Mean : 50938.83 Mean : 63452.2 Mean : 9194
## 3rd Qu.:72129 3rd Qu.: 58256.00 3rd Qu.: 70341.2 3rd Qu.: 12516
## Max. :99929 Max. :223106.17 Max. :361842.3 Max. :113916
data("zipcode")
dfzip <- data.frame(zipcode)
head(dfzip)
## zip city state latitude longitude
## 1 00210 Portsmouth NH 43.0059 -71.0132
## 2 00211 Portsmouth NH 43.0059 -71.0132
## 3 00212 Portsmouth NH 43.0059 -71.0132
## 4 00213 Portsmouth NH 43.0059 -71.0132
## 5 00214 Portsmouth NH 43.0059 -71.0132
## 6 00215 Portsmouth NH 43.0059 -71.0132
df$zip <- zipcode::clean.zipcodes(df$zip)
head(df)
## # A tibble: 6 x 4
## zip median mean population
## <chr> <dbl> <dbl> <dbl>
## 1 01001 56663 66688 16445
## 2 01002 49853 75063 28069
## 3 01003 28462 35121 8491
## 4 01005 75423 82442 4798
## 5 01007 79076 85802 12962
## 6 01008 63980 78391 1244
df_merged <- merge(df, dfzip)
head(df_merged)
## zip median mean population city state latitude longitude
## 1 01001 56662.57 66687.75 16445 Agawam MA 42.07061 -72.62029
## 2 01002 49853.42 75062.63 28069 Amherst MA 42.37765 -72.50323
## 3 01003 28462.00 35121.00 8491 Amherst MA 42.36956 -72.63599
## 4 01005 75423.00 82442.00 4798 Barre MA 42.41209 -72.10443
## 5 01007 79076.35 85801.98 12962 Belchertown MA 42.27842 -72.41100
## 6 01008 63980.00 78391.00 1244 Blandford MA 42.17431 -72.94828
number of rows in df_merged
nrow(df_merged)
## [1] 32627
check unique state
length(unique(df_merged$state))
## [1] 51
dfstate <- data.frame(state.abb,state.name)
colnames(dfstate) <- c('state','statename')
head(dfstate)
## state statename
## 1 AL Alabama
## 2 AK Alaska
## 3 AZ Arizona
## 4 AR Arkansas
## 5 CA California
## 6 CO Colorado
length(unique(dfstate$state))
## [1] 50
df_merged <- df_merged[which(!(df_merged$state == 'HI' | df_merged$state == 'AK')),]
length(unique(df_merged$state))
## [1] 49
dfstate <- dfstate[which(!(dfstate$state == 'HI' | dfstate$state == 'AK')),]
length(unique(dfstate$state))
## [1] 48
list the unique state in df_merged
unique(df_merged$state)
## [1] "MA" "RI" "NH" "ME" "VT" "CT" "NY" "NJ" "PA" "DE" "DC" "VA" "MD" "WV"
## [15] "NC" "SC" "GA" "TN" "FL" "AL" "AR" "KY" "MS" "OH" "IN" "MI" "IA" "IL"
## [29] "WI" "MN" "SD" "ND" "MT" "MO" "KS" "NE" "LA" "OK" "TX" "NM" "CO" "WY"
## [43] "ID" "UT" "AZ" "NV" "CA" "OR" "WA"
list the unique state in dfState
unique(dfstate$state)
## [1] AL AZ AR CA CO CT DE FL GA ID IL IN IA KS KY LA ME MD MA MI MN MS MO
## [24] MT NE NV NH NJ NM NY NC ND OH OK OR PA RI SC SD TN TX UT VT VA WA WV
## [47] WI WY
## 50 Levels: AK AL AR AZ CA CO CT DE FL GA HI IA ID IL IN KS KY LA MA ... WY
setdiff(df_merged$state, dfstate$state)
## [1] "DC"
df_merged <- df_merged[which(!(df_merged$state == 'DC')),]
length(unique(df_merged$state))
## [1] 48
df_clean <- merge(dfstate, df_merged)
head(df_clean)
## state statename zip median mean population city latitude
## 1 AL Alabama 36071 34712.39 45707.75 766 Rutledge 31.70884
## 2 AL Alabama 36075 41282.95 50787.74 2226 Shorter 32.39999
## 3 AL Alabama 36078 41203.15 57269.85 13179 Tallassee 32.55304
## 4 AL Alabama 35004 55416.04 60783.88 10427 Moody 33.60638
## 5 AL Alabama 35005 48546.78 54406.99 7942 Adamsville 33.59258
## 6 AL Alabama 35006 59241.18 65892.12 3121 Adger 33.45171
## longitude
## 1 -86.38617
## 2 -85.92370
## 3 -85.91360
## 4 -86.50249
## 5 -86.95969
## 6 -87.23957
check the length of the unique state
length(unique(df_clean$state))
## [1] 48
df_clean$statename <- tolower(df_clean$statename)
df_clean$city <- tolower(df_clean$city)
head(df_clean)
## state statename zip median mean population city latitude
## 1 AL alabama 36071 34712.39 45707.75 766 rutledge 31.70884
## 2 AL alabama 36075 41282.95 50787.74 2226 shorter 32.39999
## 3 AL alabama 36078 41203.15 57269.85 13179 tallassee 32.55304
## 4 AL alabama 35004 55416.04 60783.88 10427 moody 33.60638
## 5 AL alabama 35005 48546.78 54406.99 7942 adamsville 33.59258
## 6 AL alabama 35006 59241.18 65892.12 3121 adger 33.45171
## longitude
## 1 -86.38617
## 2 -85.92370
## 3 -85.91360
## 4 -86.50249
## 5 -86.95969
## 6 -87.23957
summary(df_clean)
## state statename zip median
## TX : 1904 Length:32284 Length:32284 Min. : 32.98
## PA : 1790 Class :character Class :character 1st Qu.: 38447.86
## NY : 1755 Mode :character Mode :character Median : 46475.90
## CA : 1744 Mean : 50898.64
## IL : 1383 3rd Qu.: 58126.12
## OH : 1192 Max. :223106.17
## (Other):22516
## mean population city latitude
## Min. : 53.6 Min. : 1 Length:32284 Min. :24.57
## 1st Qu.: 48559.7 1st Qu.: 748 Class :character 1st Qu.:35.47
## Median : 56907.9 Median : 2788 Mode :character Median :39.51
## Mean : 63407.1 Mean : 9211 Mean :38.81
## 3rd Qu.: 70258.1 3rd Qu.: 12554 3rd Qu.:42.08
## Max. :361842.3 Max. :113916 Max. :49.34
##
## longitude
## Min. :-124.64
## 1st Qu.: -97.02
## Median : -88.15
## Mean : -90.49
## 3rd Qu.: -80.30
## Max. : -67.00
##
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
df_income <- sqldf('select statename as state, AVG(median) as medincome, sum(population) as sumpop from df_clean group by state')
head(df_income)
## state medincome sumpop
## 1 alabama 40549.90 4770242
## 2 arkansas 36960.95 2936699
## 3 arizona 48132.06 6360679
## 4 california 62625.91 36926889
## 5 colorado 56303.02 4979279
## 6 connecticut 78520.16 3548308
summary(df_income)
## state medincome sumpop
## Length:48 Min. :34210 Min. : 555216
## Class :character 1st Qu.:43847 1st Qu.: 1842437
## Mode :character Median :49054 Median : 4510296
## Mean :51330 Mean : 6195112
## 3rd Qu.:55537 3rd Qu.: 6956398
## Max. :82208 Max. :36926889
us <- map_data('state')
head(us)
## long lat group order region subregion
## 1 -87.46201 30.38968 1 1 alabama <NA>
## 2 -87.48493 30.37249 1 2 alabama <NA>
## 3 -87.52503 30.37249 1 3 alabama <NA>
## 4 -87.53076 30.33239 1 4 alabama <NA>
## 5 -87.57087 30.32665 1 5 alabama <NA>
## 6 -87.58806 30.32665 1 6 alabama <NA>
Median income by state
g <- ggplot(df_income, aes(map_id = state))
g <- g + geom_map(map=us, aes(fill=df_income$medincome))
g <- g + expand_limits(x=us$long, y=us$lat) + coord_map()
g <- g + ggtitle('Median Income by State') + theme_light()
g <- g + scale_fill_gradient(low = 'lightblue', high = 'darkblue', name ='Median Income')
g <- g + xlab('Longitude') + ylab('Latitude')
g
g <- ggplot(df_income, aes(map_id = state)) + geom_map(map=us,aes(fill=df_income$sumpop))
g <- g + expand_limits(x=us$long, y=us$lat) + coord_map()
g <- g + ggtitle('Population by State') + theme_light()
g <- g + scale_fill_gradient(low = 'lightblue', high = 'darkblue', name ='Median Income')
g <- g + xlab('Longitude') + ylab('Latitude')
g
g <- ggplot(df_clean,aes(map_id = statename))
g <- g + geom_map(map=us)
g <- g + expand_limits(x=us$long, y=us$lat) + coord_map()
g <- g + geom_point(aes(x=df_clean$longitude, y=df_clean$latitude, color= median)) + theme_light()
g <- g + ggtitle('Median income by zip code')
g <- g + xlab('Longitude') + ylab('Latitude')
#g <- g + scale_fill_gradientn(colours = rainbow(4), name ='Median Income by Zip')
g
g <- ggplot(df_clean,aes(map_id = statename)) + geom_map(map=us, fill = 'white', color ='black')
g <- g + expand_limits(x=us$long, y=us$lat) + coord_map()
g <- g + stat_density2d(data = df_clean, aes(x= longitude, y= latitude,fill =..level..,alpha= ..level.. ),size=2,bins=5, geom = "polygon")
g <- g + theme_classic() + ggtitle('Zipcode density map of US')
g <- g + xlab('Longitude') + ylab('Latitude')
g
#New York latlon
#latlon <- geocode("New York",output = "latlona", source = 'dsk')
g <- ggplot(df_clean,aes(map_id = statename))
g <- g + geom_map(map=us)
#g <- g + expand_limits(x=latlon$lon, y=latlon$lat) + coord_map()
g <- g + xlim(-80,-67) + ylim(37.5,47.5) + coord_map()
g <- g + geom_point(aes(x=df_clean$longitude, y=df_clean$latitude, color= median)) + theme_light()
g <- g + ggtitle('Median income by zip code')
g <- g + xlab('Longitude') + ylab('Latitude')
#g <- g + scale_fill_gradientn(colours = rainbow(4), name ='Median Income by Zip,NY')
g
## Warning: Removed 25448 rows containing missing values (geom_point).
g <- ggplot(df_clean,aes(map_id = statename)) + geom_map(map=us, fill = 'white', color ='black')
#g <- g + expand_limits(x=us$long, y=us$lat) + coord_map()
g <- g + xlim(-80,-67) + ylim(37.5,47.5) + coord_map()
g <- g + stat_density2d(data = df_clean, aes(x= longitude, y= latitude,fill =..level..,alpha= ..level.. ),size=2,bins=5, geom = "polygon")
g <- g + theme_classic() + ggtitle('Zipcode density map of NY')
g <- g + xlab('Longitude') + ylab('Latitude')
g
## Warning: Removed 25448 rows containing non-finite values (stat_density2d).