#Install required packages
pkg <- c("sqldf", "tidyverse")
new.packages <- pkg[!(pkg %in%
installed.packages()[,"Package"])]
if(length(new.packages))install.packages(new.packages)
#Set working directory and read the CSV file
setwd("C:\\Users\\cod\\Desktop\\PhD Files\\Course Works\\2nd Year Falls Semester\\Urban Analytics\\Assignment\\AssignmentR_RBasics\\AssignmentR_RBasics")
energyNY <- read.csv("EnergyNY.CSV", header = T, stringsAsFactors = F)
unique(energyNY$Postcode)
## [1] 10304 10301 11432 11435 11415 11379 11101 11211 11225 11201 10466 10457
## [13] 10451 10456 10035 10019 10002 10013 10007 10038
length(unique(energyNY$Postcode))
## [1] 20
Another way of getting the number of unique postcodes in dataset
nrow(unique(energyNY[c("Postcode")]))
## [1] 20
table(energyNY$Postcode)
##
## 10002 10007 10013 10019 10035 10038 10301 10304 10451 10456 10457 10466 11101
## 1 9 7 1 1 1 5 2 2 1 1 1 1
## 11201 11211 11225 11379 11415 11432 11435
## 7 1 1 1 2 1 2
Another way to get the number of records for each postcode
plyr::count(energyNY$Postcode)
## x freq
## 1 10002 1
## 2 10007 9
## 3 10013 7
## 4 10019 1
## 5 10035 1
## 6 10038 1
## 7 10301 5
## 8 10304 2
## 9 10451 2
## 10 10456 1
## 11 10457 1
## 12 10466 1
## 13 11101 1
## 14 11201 7
## 15 11211 1
## 16 11225 1
## 17 11379 1
## 18 11415 2
## 19 11432 1
## 20 11435 2
maxEnergy <- aggregate(energyNY[,"EnergyUse_MMBTU"],
by = list(energyNY$Borough),FUN = max)
maxEnergy
## Group.1 x
## 1 Bronx 62113
## 2 Brooklyn 104757
## 3 Manhattan 126511
## 4 Queens 53873
## 5 Staten Island 20745
colnames(maxEnergy)[1:2] <- c("Borough", "Max_Energy_Usage")
maxEnergy
## Borough Max_Energy_Usage
## 1 Bronx 62113
## 2 Brooklyn 104757
## 3 Manhattan 126511
## 4 Queens 53873
## 5 Staten Island 20745
data <- data.frame(table(energyNY$Postcode))
colnames(data)[1:2] <- c("Postcode", "Frequency")
ggplot(data = data, mapping = aes(x = reorder(Postcode, Frequency), y = Frequency))+
geom_col(aes(fill = Postcode),show.legend = F)+ labs(title = "Number of Records per Postcode", x = "Postcode",
y = "Frequency")+ scale_fill_brewer(palette = "Paired")+
scale_y_continuous(breaks = scales::pretty_breaks())+
xlim("10007","10013","11201","10301","10304","10451","11415","11435","10002","10019")
## Warning: Removed 10 rows containing missing values (position_stack).
nycData <- read.csv("NYCZIPCodePop.csv", header = T, stringsAsFactors = F)
fulldata <- merge(energyNY, nycData, by.x = "Postcode", by.y = "ZipCode")
head(fulldata)
## Postcode BBL BoroughNumber BuildingName
## 1 10002 1004230022 1 115 Chrystie
## 2 10007 1001600021 1 Manhattan Supreme Court
## 3 10007 1001550001 1 Court Square Building
## 4 10007 1001540023 1 City Planning Building
## 5 10007 1001530024 1 Manhattan Surrogate's Court
## 6 10007 1001530018 1 Emigrant Savings Building
## Address Borough State Block Lot EnergyUse_MMBTU Latitude
## 1 115 Chrystie Street Manhattan New York 423 22 1974 40.71865
## 2 60 Centre St Manhattan New York 160 21 36789 40.71447
## 3 2 Lafayette St Manhattan New York 155 1 29227 40.71372
## 4 22 Reade St Manhattan New York 154 23 8951 40.71423
## 5 31 Chambers St Manhattan New York 153 24 15059 40.71340
## 6 49-51 Chambers St Manhattan New York 153 18 19933 40.71377
## Longitude Community.Board CouncilDistrict Tract BIN
## 1 -73.99359 3 1 18 1005645
## 2 -74.00250 1 1 29 1085748
## 3 -74.00359 1 1 31 1001672
## 4 -74.00490 1 1 31 1078613
## 5 -74.00463 1 1 31 1001670
## 6 -74.00543 1 1 31 1079216
## NTA
## 1 Chinatown
## 2 Chinatown
## 3 SoHo-TriBeCa-Civic Center-Little Italy
## 4 SoHo-TriBeCa-Civic Center-Little Italy
## 5 SoHo-TriBeCa-Civic Center-Little Italy
## 6 SoHo-TriBeCa-Civic Center-Little Italy
## Population X
## 1 74479 NA
## 2 7408 NA
## 3 7408 NA
## 4 7408 NA
## 5 7408 NA
## 6 7408 NA
ggplot(data = fulldata, mapping = aes(x = Population, y = EnergyUse_MMBTU))+
geom_point(size = 3, fill = "yellow", shape = 21)+
labs(title = "Populaiton by Energy Usage", x = "Population",
y = "Energy Usage")+
scale_colour_brewer(palette = "Greens")+
scale_y_continuous(labels = scales::comma)