# The goal of this assignment is to give you practice in preparing different datasets for downstream analysis work. 

# Your task is to: (1) Choose any three of the "wide" datasets identified in the Week 6 Discussion items.  (You may use your own dataset; please don't use my Sample Post dataset, since that was used in your Week 6 assignment!)  
# For each of the three chosen datasets: ???
# Create a .CSV file (or optionally, a MySQL database!) that includes all of the information included in the dataset.  You're encouraged to use a "wide" structure similar to how the information appears in the discussion item, 
# so that you can practice tidying and transformations as described below. ???
# Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.  [Most of your grade will be based on this step!] ???
# Perform the analysis requested in the discussion item. ??? Your code should be in an R Markdown file, posted to rpubs.com, and should include narrative descriptions of your data cleanup work, analysis, and conclusions. 

# Please include in your homework submission, for each of the three chosen datasets: ??? 
# The URL to the .Rmd file in your GitHub repository, and  ???
# The URL for your rpubs.com web page. 

# Setting up environment
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)


# Dataset 1:Energy Efficiency Projects
data1 <- read.csv("https://raw.githubusercontent.com/Jennier2015/DATA-607/master/Project%202/Energy_Efficiency_Projects.csv")
head(data1)
##        PrimaryProjectType             SubType ProjectStatus
## 1     HVAC System Upgrade                         Completed
## 2 Lighting System Upgrade            Retrofit     Completed
## 3 Lighting System Upgrade            Retrofit       Install
## 4           Comprehensive            Retrofit     Cancelled
## 5            Energy Audit                None     Completed
## 6                   Other Retro-commissioning      Proposed
##   StartDateActual FinishDateActual                  ProjectSiteName
## 1                                                                  
## 2       09-Apr-12        28-Sep-12           Leonard Branch Library
## 3       14-May-12        28-Sep-12 Washington Irving Branch Library
## 4                                   Brooklyn Heights Branch Library
## 5       16-Mar-10        23-Aug-10  Brooklyn Heights Branch Library
## 6                                   Brooklyn Heights Branch Library
##      ProjectSiteAddress Acronym CO2e..MT..Calculated
## 1                           BPL                   NA
## 2       81 Devoe Street     BPL                15.10
## 3     360 Irving Avenue     BPL                11.90
## 4 280 Cadman Plaza West     BPL               117.07
## 5     286 Fulton Street     BPL               135.53
## 6 280 Cadman Plaza West     BPL                   NA
# The City's Long-Term Sustainability Plan, PlaNYC, calls for City government to reduce greenhouse gas emissions produced by municipal operations 
# 30% by 2017. A large part of this effort is retrofitting municipal buildings to become more energy efficient. This dataset lists energy 
# efficiency projects at City buildings that are complete or in progress. Projects are listed by address, building name, and agency, 
# and the dataset includes the estimated reduction in tons of GHG emissions associated with the project. In summary, this dataset has 584 rows and 9 columns.
# It was provided by NYC OpenData (Thanks to Duubar Vallalobos Jimenez from Disscussion Forum)

# Data transformation
# Tidying the Data
new_data1 <- data1 %>% separate(StartDateActual, c("year", "month", "day"), sep = "-")
## Warning: Too few values at 268 locations: 1, 4, 6, 7, 8, 9, 10, 11, 12, 14,
## 17, 19, 20, 21, 28, 30, 33, 34, 35, 36, ...
head(new_data1)
##        PrimaryProjectType             SubType ProjectStatus year month
## 1     HVAC System Upgrade                         Completed       <NA>
## 2 Lighting System Upgrade            Retrofit     Completed   09   Apr
## 3 Lighting System Upgrade            Retrofit       Install   14   May
## 4           Comprehensive            Retrofit     Cancelled       <NA>
## 5            Energy Audit                None     Completed   16   Mar
## 6                   Other Retro-commissioning      Proposed       <NA>
##    day FinishDateActual                  ProjectSiteName
## 1 <NA>                                                  
## 2   12        28-Sep-12           Leonard Branch Library
## 3   12        28-Sep-12 Washington Irving Branch Library
## 4 <NA>                   Brooklyn Heights Branch Library
## 5   10        23-Aug-10  Brooklyn Heights Branch Library
## 6 <NA>                   Brooklyn Heights Branch Library
##      ProjectSiteAddress Acronym CO2e..MT..Calculated
## 1                           BPL                   NA
## 2       81 Devoe Street     BPL                15.10
## 3     360 Irving Avenue     BPL                11.90
## 4 280 Cadman Plaza West     BPL               117.07
## 5     286 Fulton Street     BPL               135.53
## 6 280 Cadman Plaza West     BPL                   NA
new_data1 <-  data1 %>% select(PrimaryProjectType:ProjectStatus)
head(new_data1)
##        PrimaryProjectType             SubType ProjectStatus
## 1     HVAC System Upgrade                         Completed
## 2 Lighting System Upgrade            Retrofit     Completed
## 3 Lighting System Upgrade            Retrofit       Install
## 4           Comprehensive            Retrofit     Cancelled
## 5            Energy Audit                None     Completed
## 6                   Other Retro-commissioning      Proposed
# Analysis
table1 <- table(new_data1$PrimaryProjectType)
View(table1)
barplot(table(new_data1$PrimaryProjectType),
        main = "Energy Efficiency Projects",
        xlab = c("Comprehensive", "Audit", "HVAC", "Lighting", "Other"),
        ylab = "Frequency")

table2 <- table(new_data1$ProjectStatus)
View(table2)
sum(table2)
## [1] 584
# Conclusion: In NYC, there are more lighting upgrade projects than the other kinds of energy efficiency projects. And other than regular and common types of projects, there are still a good amount of other projects
# for energy efficiency. And regarding the project status, out of 584 projects, there are 260 completed projects, which is a good rate of projects accomplished.

# Dataset 2: 
data2 <- read.csv("https://raw.githubusercontent.com/Jennier2015/DATA-607/master/Project%202/Energy_Usage_From_DOE_Buildings.csv")
head(data2)
##                                         Address             Measurement
## 1   512 West 212th St [Shorac Kappock School]\n Electricity Demand (KW)
## 2   512 West 212th St [Shorac Kappock School]\n Electricity Usage (KWH)
## 3   512 West 212th St [Shorac Kappock School]\n            Gas (Therms)
## 4   512 West 212th St [Shorac Kappock School]\n    Total Usage (mmBTUs)
## 5 232 East 103rd St [Roberto Clemente School]\n Electricity Demand (KW)
## 6 232 East 103rd St [Roberto Clemente School]\n Electricity Usage (KWH)
##          X.        X_ X.FY.2009..7.1.2008    Aug.08    Sep.08    Oct.08
## 1   $144.00    $68.00             $148.00   $158.00   $166.00   $180.00
## 2 $30400.00 $31000.00           $46200.00 $45200.00 $51800.00 $58400.00
## 3   $109.00   $112.00             $305.00  $2298.00  $7666.00 $13710.00
## 4   $114.65   $117.00             $188.18   $384.07   $943.39  $1570.32
## 5   $123.20   $150.40             $150.40   $124.80   $136.00   $144.00
## 6 $53920.00 $58560.00           $56640.00 $48640.00 $54560.00 $61440.00
##      Nov.08    Dec.08    Jan.09    Feb.09    Mar.09    Apr.09    May.09
## 1   $168.00   $166.00   $166.00   $162.00   $142.00   $146.00   $146.00
## 2 $56600.00 $52200.00 $55200.00 $44200.00 $48600.00 $47400.00 $34200.00
## 3 $16197.00 $12846.00 $10322.00  $4660.00  $1138.00   $348.00   $150.00
## 4  $1812.87  $1462.76  $1220.60   $616.85   $279.67   $196.57   $131.72
## 5   $144.00   $144.00   $142.40   $142.40   $145.60   $140.80   $128.00
## 6 $61600.00 $60640.00 $60800.00 $51200.00 $48160.00 $45280.00 $52000.00
##      Jun.09 X.FY.2010.7.1.2009    Aug.09    Sep.09    Oct.09    Nov.09
## 1    $90.00            $148.00   $152.00   $170.00   $178.00   $184.00
## 2 $30600.00          $44800.00 $52400.00 $54200.00 $63200.00 $60800.00
## 3   $112.00            $243.00  $2835.00  $5873.00 $14903.00 $15662.00
## 4   $115.64            $177.20   $462.34   $772.28  $1706.00  $1773.71
## 5   $134.40            $148.80   $128.00   $128.00   $128.00   $126.40
## 6 $57280.00          $52160.00 $52320.00 $47040.00 $57440.00 $48800.00
##      Dec.09    Jan.10    Feb.10    Mar.10    Apr.10    May.10    Jun.10
## 1   $190.00   $188.00   $172.00   $158.00   $154.00   $136.00    $64.00
## 2 $63200.00 $58000.00 $48200.00 $50800.00 $52200.00 $33000.00 $28600.00
## 3 $16322.00  $8197.00  $2966.00  $1298.00   $275.00   $106.00    $93.00
## 4  $1847.90  $1017.65   $461.11   $303.18   $205.66   $123.23   $106.91
## 5   $126.40   $124.80   $121.60   $140.80   $160.00   $153.60   $139.20
## 6 $52000.00 $44160.00 $38720.00 $39200.00 $48160.00 $53280.00 $50240.00
##   X.FY.2011.7.1.2010    Aug.10    Sep.10    Oct.10    Nov.10    Dec.10
## 1            $140.00   $156.00   $166.00   $178.00   $178.00   $178.00
## 2          $42600.00 $52800.00 $53800.00 $65200.00 $55600.00 $59600.00
## 3            $193.00  $1645.00  $6448.00 $16825.00 $16787.00 $14045.00
## 4            $164.69   $344.70   $828.42  $1905.03  $1868.46  $1607.91
## 5            $156.80   $148.80   $134.40   $134.40   $134.40   $134.40
## 6          $45440.00 $46080.00 $42080.00 $48800.00 $41600.00 $44640.00
##      Jan.11    Feb.11    Mar.11    Apr.11    May.11    Jun.11
## 1   $172.00   $164.00   $154.00   $170.00   $156.00    $82.00
## 2 $58000.00 $48200.00 $49600.00 $52000.00 $31400.00 $20600.00
## 3 $10629.00  $5373.00   $831.00   $339.00   $124.00   $115.00
## 4  $1260.85   $701.81   $252.38   $211.37   $119.57    $81.81
## 5   $129.60   $131.20   $124.80   $182.40   $171.20   $145.60
## 6 $40800.00 $37600.00 $40160.00 $53920.00 $48320.00 $54560.00
##   X.FY.2012.7.1.2011    Aug.11    Sep.11    Oct.11    Nov.11    Dec.11
## 1             $58.00   $168.00   $164.00   $162.00   $166.00   $160.00
## 2           $9800.00 $91600.00 $58800.00 $53400.00 $57600.00 $58600.00
## 3            $273.00   $778.00  $6037.00  $8103.00 $13972.00 $12298.00
## 4             $60.75   $390.43   $804.38   $992.55  $1593.79  $1429.80
## 5            $174.40   $169.60   $139.20   $132.80   $132.80   $132.80
## 6          $54560.00 $44640.00 $46400.00 $42080.00 $40640.00 $43680.00
##      Jan.12    Feb.12    Mar.12    Apr.12 May.12 Jun.12
## 1   $156.00   $146.00   $146.00   $156.00     NA     NA
## 2 $50400.00 $43800.00 $47400.00 $53600.00     NA     NA
## 3  $6866.00  $3781.00  $1410.00   $302.00     NA     NA
## 4   $858.61   $527.59   $302.77   $213.14     NA     NA
## 5   $128.00   $158.40   $134.40   $187.20     NA     NA
## 6 $42560.00 $38240.00 $42880.00 $52640.00     NA     NA
View(data2)

# The dataset is from Energy data from a select portfolio of City-owned buildings (DOE) provided by NYC OpenData.

# Data transformation
# Tidying the Data
# Because the 2008 and 2012 data is not for the whole year, I would like to exclude the data from 2008 and 2012.
data_whole <- data2[ -c(3:10, 47:52)]
View(data_whole)


# I want to select the 2009 dataset for total usage and do some analysis.
data_2009 <- filter(data_whole, data_whole$Measurement == "Total Usage (mmBTUs)")
View(data_2009)
data_2009_totalusage <- select(data_2009, Jan.09 :Dec.09)

View(data_2009_totalusage) 

# Change fiscal month to regular name Jul.09 
colnames(data_2009_totalusage)[7] <- "Jul.09"
View(data_2009_totalusage)




# Dataset 3: Wastewater Treatment Plants
# The dataset is provided by Department of Environmental Protection (DEP).
data3 <- read.csv("https://raw.githubusercontent.com/Jennier2015/DATA-607/master/Project%202/Wastewater_Treatment_Plants.csv")
head(data3)
##   Wastewater.Treatment.Plant.Name Location Plant.in.operation
## 1                  26th Ward WWTP     East               1944
## 2                 Bowery Bay WWTP    North               1939
## 3               Coney Island WWTP     East               1935
## 4                Hunts Point WWTP    North               1952
## 5                    Jamaica WWTP     East        1903 / 1943
## 6              Newtown Creek WWTP    South               1967
##   Design.Capacity       Dewatering Population.Served Receiving.Waterbody
## 1          85 MGD        26th Ward            283428         Jamaica Bay
## 2         150 MGD       Bowery Bay            848328    Upper East River
## 3         110 MGD        26th Ward            596326         Jamaica Bay
## 4         200 MGD      Hunts Point            684569    Upper East River
## 5         100 MGD     Jamaica WWTP            728123         Jamaica Bay
## 6         310 MGD Hunts Point WWTP           1068012          East River
##                                                                                                                Drainage.Area
## 1                                                                 5,907 Acres, eastern section of Brooklyn, near Jamaica Bay
## 2                                                                                  15,203 Acres, northeast section of Queens
## 3                                                                                   15,087 Acres, south and central Brooklyn
## 4                                                                                 16,664 Acres, eastern section of the Bronx
## 5                                                                                   25,313 Acres, southern section of Queens
## 6 15,656 Acres, south and eastern midtown sections of Manhattan, northeast section of Brooklyn and western section of Queens
##   Plant.Staff Note
## 1          93     
## 2          81     
## 3          69     
## 4         108     
## 5          66     
## 6          88
# Tidying the Data
data3_analysis <- select(data3, Location, Population.Served)
View(data3_analysis)
arrange(data3_analysis, Location)
##    Location Population.Served
## 1      East            283428
## 2      East            596326
## 3      East            728123
## 4      East             90474
## 5      East            758007
## 6     North            848328
## 7     North            684569
## 8     North           1061558
## 9     North            410812
## 10    South           1068012
## 11    South            588772
## 12    South            198128
## 13    South            192050
## 14    South            244918
data3_analysis %>% 
  summarise(mean = mean(data3_analysis$Population.Served), sum = sum(data3_analysis$Population.Served), n = n())
##       mean     sum  n
## 1 553821.8 7753505 14
# The mean population servied for each is 553821.8, and total popluation is 7753505.

# Analysis Question: How are the wastewater treatment plants spreaded out based on location?
plot(data3_analysis$Population.Served ~ data3_analysis$Location)

# Conclusion: 1. The average population servied for each plant is 553821.8. 2. The average population servied at north are higher than East and South.