#Install required packages
pkg <- c("sqldf", "tidyverse")
new.packages <- pkg[!(pkg %in%
                        installed.packages()[,"Package"])]
if(length(new.packages))install.packages(new.packages)
#Set working directory and read the CSV file
setwd("C:\\Users\\cod\\Desktop\\PhD Files\\Course Works\\2nd Year Falls Semester\\Urban Analytics\\Assignment\\AssignmentR_RBasics\\AssignmentR_RBasics")
energyNY <- read.csv("EnergyNY.CSV", header = T, stringsAsFactors = F)
  1. List unique postcodes.
unique(energyNY$Postcode)
##  [1] 10304 10301 11432 11435 11415 11379 11101 11211 11225 11201 10466 10457
## [13] 10451 10456 10035 10019 10002 10013 10007 10038
  1. Show how many unique postcodes there are
length(unique(energyNY$Postcode))
## [1] 20

Another way of getting the number of unique postcodes in dataset

nrow(unique(energyNY[c("Postcode")]))
## [1] 20
  1. Show how many records (rows) each postcode has.
table(energyNY$Postcode)
## 
## 10002 10007 10013 10019 10035 10038 10301 10304 10451 10456 10457 10466 11101 
##     1     9     7     1     1     1     5     2     2     1     1     1     1 
## 11201 11211 11225 11379 11415 11432 11435 
##     7     1     1     1     2     1     2

Another way to get the number of records for each postcode

plyr::count(energyNY$Postcode)
##        x freq
## 1  10002    1
## 2  10007    9
## 3  10013    7
## 4  10019    1
## 5  10035    1
## 6  10038    1
## 7  10301    5
## 8  10304    2
## 9  10451    2
## 10 10456    1
## 11 10457    1
## 12 10466    1
## 13 11101    1
## 14 11201    7
## 15 11211    1
## 16 11225    1
## 17 11379    1
## 18 11415    2
## 19 11432    1
## 20 11435    2
  1. Create a table of max energy usage per borough. (5 total rows, 2 columns). Create nice column headings for the table.
maxEnergy <- aggregate(energyNY[,"EnergyUse_MMBTU"], 
                       by = list(energyNY$Borough),FUN = max)
maxEnergy
##         Group.1      x
## 1         Bronx  62113
## 2      Brooklyn 104757
## 3     Manhattan 126511
## 4        Queens  53873
## 5 Staten Island  20745
colnames(maxEnergy)[1:2] <- c("Borough", "Max_Energy_Usage")

maxEnergy
##         Borough Max_Energy_Usage
## 1         Bronx            62113
## 2      Brooklyn           104757
## 3     Manhattan           126511
## 4        Queens            53873
## 5 Staten Island            20745
  1. Create a bar chart that has number of records per postcode. Label the bar chart nicely, including title. Not all zip code numbers have to show up.
data <- data.frame(table(energyNY$Postcode))

colnames(data)[1:2] <- c("Postcode", "Frequency")

ggplot(data = data, mapping = aes(x = reorder(Postcode, Frequency), y = Frequency))+
  geom_col(aes(fill = Postcode),show.legend = F)+ labs(title = "Number of Records per Postcode", x = "Postcode",
                   y = "Frequency")+ scale_fill_brewer(palette = "Paired")+
  scale_y_continuous(breaks = scales::pretty_breaks())+
  xlim("10007","10013","11201","10301","10304","10451","11415","11435","10002","10019")
## Warning: Removed 10 rows containing missing values (position_stack).

  1. A nearby CSV has zip code population for NYC. Create a scatter plot of population by energy usage. (You need a table join for this). Label the chart nicely, including title.
nycData <- read.csv("NYCZIPCodePop.csv", header = T, stringsAsFactors = F)

fulldata <- merge(energyNY, nycData, by.x = "Postcode", by.y = "ZipCode")

head(fulldata)
##   Postcode        BBL BoroughNumber                BuildingName
## 1    10002 1004230022             1                115 Chrystie
## 2    10007 1001600021             1     Manhattan Supreme Court
## 3    10007 1001550001             1       Court Square Building
## 4    10007 1001540023             1      City Planning Building
## 5    10007 1001530024             1 Manhattan Surrogate's Court
## 6    10007 1001530018             1   Emigrant Savings Building
##               Address   Borough    State Block Lot EnergyUse_MMBTU Latitude
## 1 115 Chrystie Street Manhattan New York   423  22            1974 40.71865
## 2       60 Centre St  Manhattan New York   160  21           36789 40.71447
## 3     2 Lafayette St  Manhattan New York   155   1           29227 40.71372
## 4        22 Reade St  Manhattan New York   154  23            8951 40.71423
## 5     31 Chambers St  Manhattan New York   153  24           15059 40.71340
## 6  49-51 Chambers St  Manhattan New York   153  18           19933 40.71377
##   Longitude Community.Board CouncilDistrict Tract     BIN
## 1 -73.99359               3               1    18 1005645
## 2 -74.00250               1               1    29 1085748
## 3 -74.00359               1               1    31 1001672
## 4 -74.00490               1               1    31 1078613
## 5 -74.00463               1               1    31 1001670
## 6 -74.00543               1               1    31 1079216
##                                                                           NTA
## 1 Chinatown                                                                  
## 2 Chinatown                                                                  
## 3 SoHo-TriBeCa-Civic Center-Little Italy                                     
## 4 SoHo-TriBeCa-Civic Center-Little Italy                                     
## 5 SoHo-TriBeCa-Civic Center-Little Italy                                     
## 6 SoHo-TriBeCa-Civic Center-Little Italy                                     
##   Population  X
## 1      74479 NA
## 2       7408 NA
## 3       7408 NA
## 4       7408 NA
## 5       7408 NA
## 6       7408 NA
ggplot(data = fulldata, mapping = aes(x = Population, y = EnergyUse_MMBTU))+
  geom_point(size = 3, fill = "yellow", shape = 21)+
  labs(title = "Populaiton by Energy Usage", x = "Population",
                   y = "Energy Usage")+ 
  scale_colour_brewer(palette = "Greens")+ 
  scale_y_continuous(labels = scales::comma)