# Set the working directory and read in the files
if (Sys.info()["sysname"] == "Windows") {
setwd("~/Masters/DATA606/DataProject")
} else {
setwd("~/Documents/Masters/DATA606/DataProject")
}
fileurl = "PublicLEEDProjectDirectory.csv"
project_data <- read.csv(file = fileurl, header = TRUE, sep = ",", stringsAsFactors = FALSE)
svfileurl = "SummaryVersionsMap.csv"
sv_data <- read.csv(file = svfileurl, header = TRUE, sep = ",", stringsAsFactors = FALSE)
# Format dates
project_data$CertDate <- as.Date(project_data$CertDate, "%m/%d/%Y")
project_data$RegistrationDate <- as.Date(project_data$RegistrationDate, "%m/%d/%Y")
# Add two calculated columns. SubtmittalTime is the number of days between certification
# and registration dates (completed projects). SubmittalTimePending is the number of days
# between the date the information was pulled and the registration date (open projects).
project_data$SubmittalTime <- as.numeric(project_data$CertDate - project_data$RegistrationDate)
latest_date <- max(project_data$RegistrationDate, na.rm = TRUE)
project_data$SubmittalTimePending <- ifelse(is.na(project_data$CertDate) == TRUE, latest_date -
project_data$RegistrationDate, NA)
# Run sapply() to do a table join between the project directory data and the
# SummaryVersionsMap file. The latter file is a mapping scheme that provides further
# sub-category information to each project type.
project_data$RatingSystemFamily <- sapply(project_data$LEEDSystemVersionDisplayName, function(x) {
x <- sv_data[which(sv_data$RatingSystem == x), 2]
x <- ifelse(identical(x, character(0)), character(0), as.character(x))
})
project_data$System <- sapply(project_data$LEEDSystemVersionDisplayName, function(x) {
x <- sv_data[which(sv_data$RatingSystem == x), 3]
x <- ifelse(identical(x, character(0)), character(0), as.character(x))
})
project_data$Version <- sapply(project_data$LEEDSystemVersionDisplayName, function(x) {
x <- sv_data[which(sv_data$RatingSystem == x), 4]
x <- ifelse(identical(x, character(0)), character(0), as.character(x))
})
project_data$Platform <- sapply(project_data$LEEDSystemVersionDisplayName, function(x) {
x <- sv_data[which(sv_data$RatingSystem == x), 5]
x <- ifelse(identical(x, character(0)), character(0), as.character(x))
})
project_data$RSLaunch <- sapply(project_data$LEEDSystemVersionDisplayName, function(x) {
x <- sv_data[which(sv_data$RatingSystem == x), 6]
})
project_data$RSLaunch <- as.Date(as.character(project_data$RSLaunch), "%m/%d/%Y")
project_data$RSRegClose <- sapply(project_data$LEEDSystemVersionDisplayName, function(x) {
x <- sv_data[which(sv_data$RatingSystem == x), 7]
})
project_data$RSRegClose <- as.Date(as.character(project_data$RSRegClose), "%m/%d/%Y")
project_data$RSSunset <- sapply(project_data$LEEDSystemVersionDisplayName, function(x) {
x <- sv_data[which(sv_data$RatingSystem == x), 8]
})
project_data$RSSunset <- as.Date(as.character(project_data$RSSunset), "%m/%d/%Y")
project_data$RSIteration <- sapply(project_data$LEEDSystemVersionDisplayName, function(x) {
x <- sv_data[which(sv_data$RatingSystem == x), 9]
})
project_data$RSIteration <- as.integer(as.character(project_data$RSIteration))
## Warning: NAs introduced by coercion
# Add a few more calculated columns now that the sub-category data has been added.
project_data$RegAfterLaunch <- as.integer(project_data$RegistrationDate - project_data$RSLaunch)
project_data$CertAfterLaunch <- as.integer(project_data$CertDate - project_data$RSLaunch)
project_data$RegYear <- trunc(project_data$RegAfterLaunch/365) + 1
project_data$CertYear <- trunc(project_data$CertAfterLaunch/365) + 1
# Create more generalized categories of the Owner types.
project_data$OwnerTypesAdj <- sapply(project_data$OwnerTypes, function(x) {
x <- ifelse(grepl("government", tolower(x)), "government", ifelse(grepl("education", tolower(x)),
"education", ifelse(grepl("confidential", tolower(x)), "confidential", ifelse(grepl("corporate",
tolower(x)), "corporate", ifelse(grepl("^profit", tolower(x)), "profit", ifelse(grepl("^non-profit",
tolower(x)), "non-profit", ifelse(grepl("individual", tolower(x)), "individual",
ifelse(grepl("investor", tolower(x)), "investor", ifelse(x == "", NA, ifelse(is.null(x),
NULL, "other"))))))))))
})
# Days the v3 platform has been open since launch
maxv3days <- as.numeric(latest_date - as.Date("4/27/2009", "%m/%d/%Y"))
project_data_adjusted <- project_data
# If a v2 project is pending longer than the maximum amount of days v3 has been open, then
# change the length to that time
project_data_adjusted$SubmittalTimePending <- ifelse(project_data_adjusted$Platform == "v2" &
project_data_adjusted$SubmittalTimePending > maxv3days, maxv3days, project_data_adjusted$SubmittalTimePending)
# If it took a v2 project to certify longer than the maximum amount of v3 days, then we
# will consider it an open project with the maximum v3 days of pending time.
project_data_adjusted$SubmittalTimePending <- ifelse(project_data_adjusted$Platform == "v2" &
project_data_adjusted$SubmittalTime > maxv3days, maxv3days, project_data_adjusted$SubmittalTimePending)
# For the same issue noted above, make sure the SubmittalTime column (days the project was
# registered before being certified) returns an NA for theses projects.
project_data_adjusted$SubmittalTime <- ifelse(project_data_adjusted$Platform == "v2" & project_data$SubmittalTime >
maxv3days, NA, project_data_adjusted$SubmittalTime)
# Make sure all columns have good data and that v4, ND, and Homes projects are excluded due
# to small size.
completed_projects <- subset(project_data_adjusted, project_data_adjusted$SubmittalTime > 0 &
!(is.na(project_data_adjusted$CertDate)) & !(identical(project_data_adjusted$RatingSystemFamily,
character(0))) & !(is.na(project_data_adjusted$RatingSystemFamily)) & !(project_data_adjusted$RatingSystemFamily ==
"ND") & !(project_data_adjusted$RatingSystemFamily == "Homes") & !(project_data_adjusted$Platform ==
"v4"))
# Same as above but with projects that have not certified.
open_projects <- subset(project_data_adjusted, project_data_adjusted$SubmittalTimePending >
0 & is.na(project_data_adjusted$CertDate) & !(identical(project_data_adjusted$RatingSystemFamily,
character(0))) & !(is.na(project_data_adjusted$Platform)) & !(project_data_adjusted$RatingSystemFamily ==
"ND") & !(project_data_adjusted$RatingSystemFamily == "Homes") & !(project_data_adjusted$Platform ==
"v4"))
You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.
Is there a significant difference between the number of days it takes for a project to certify (from the time it was regiestered) between rating system families? Is the gross floor area a good indicator of the time it takes for a project to certify?
What are the cases, and how many are there?
Each case represents a registered LEED project. There are 126237 observations in the overall data set. After filtering to projects with sufficient information, and pertinent categories, there are 67889 observations.
Describe the method of data collection.
Upon registering a LEED project, the information in this data set is populated as a part of the requirments of the LEED process.
What type of study is this (observational/experiment)?
This is an observational study.
If you collected the data, state self-collected. If not, provide a citation/link.
The project data is collected by the United States Green Building Council and is available online here: http://www.usgbc.org/project-download-all. The version mapping table, used to break projects down into sub-categories, is not publicly available in a single format. however, all the infomration needed to create this data set is available on http://www.usgbc.org.
What is the response variable, and what type is it (numerical/categorical)?
The response variable is number of days between a project’s registration and certification date (SubmittalTime). This is a discrete numerical value.
What is the explanatory variable, and what type is it (numerical/categorical)?
The explanatory variables are the rating system families (categorical, nominal) and gross floor area (numerical, continuous).
Provide summary statistics relevant to your research question. For example, if you’re comparing means across groups provide means, SDs, sample sizes of each group. This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
I have prepared a data set which includes calculations that determine the length of time it took for each project to certify. Since v3 projects can only have been open for 2871 days, v2 projects (previous version) which were certified after 2871 days were considered to be un-certified. These calcuations do not consider how long after the Rating System was launched when the project was registered. This assumption for the data set would be okay as long as we can consider that the amount of time it takes to certify a project does not change as the time between registration date and rating system launch date increases. There are graphs to check this at the end of this section.
require(ggplot2)
## Loading required package: ggplot2
#### Graphs based on present assumptions
table(completed_projects$RatingSystemFamily)
##
## BDC IDC OM
## 20446 8219 4990
table(completed_projects$Platform)
##
## v2 v3
## 14390 19265
# General Rating System Family histogram
ggplot(completed_projects, aes(x = SubmittalTime, fill = RatingSystemFamily)) + geom_histogram(binwidth = 100,
alpha = 0.5, position = "identity", aes(y = ..density..)) + labs(x = "Days Between Registration and Certification") +
ggtitle("Project Age vs. Rating System Family") + theme(plot.title = element_text(hjust = 0.5))
# General Platform ~ Rating System Family boxplot
ggplot(completed_projects, aes(y = SubmittalTime, x = Platform)) + geom_boxplot() + facet_wrap(~RatingSystemFamily) +
labs(y = "Days Between Registration and Certification") + ggtitle("Project Age Broken Down By Platform and Rating System Family") +
theme(plot.title = element_text(hjust = 0.5))
# Summary Tables
tabledata <- subset(completed_projects, completed_projects$RatingSystemFamily == "BDC")
summary(tabledata$SubmittalTime)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 744 1055 1132 1450 2870
sd(tabledata$SubmittalTime)
## [1] 537.4658
tabledata <- subset(completed_projects, completed_projects$RatingSystemFamily == "IDC")
summary(tabledata$SubmittalTime)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 273 461 527 709 2814
sd(tabledata$SubmittalTime)
## [1] 387.7426
tabledata <- subset(completed_projects, completed_projects$RatingSystemFamily == "OM")
summary(tabledata$SubmittalTime)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.0 255.0 406.5 510.1 657.0 2860.0
sd(tabledata$SubmittalTime)
## [1] 380.6742
#### Rating System Family Tests
completed_projects_BDC <- subset(completed_projects, completed_projects$RatingSystemFamily ==
"BDC")
ggplot(completed_projects_BDC, aes(x = SubmittalTime, fill = Platform)) + geom_histogram(binwidth = 50,
alpha = 0.5, position = "identity", aes(y = ..density..)) + labs(x = "Days Between Registration and Certification") +
ggtitle("BD+C Projects") + theme(plot.title = element_text(hjust = 0.5))
completed_projects_IDC <- subset(completed_projects, completed_projects$RatingSystemFamily ==
"IDC")
ggplot(completed_projects_IDC, aes(x = SubmittalTime, fill = Platform)) + geom_histogram(binwidth = 50,
alpha = 0.5, position = "identity", aes(y = ..density..)) + labs(x = "Days Between Registration and Certification") +
ggtitle("ID+C Projects") + theme(plot.title = element_text(hjust = 0.5))
completed_projects_OM <- subset(completed_projects, completed_projects$RatingSystemFamily ==
"OM")
ggplot(completed_projects_OM, aes(x = SubmittalTime, fill = Platform)) + geom_histogram(binwidth = 50,
alpha = 0.5, position = "identity", aes(y = ..density..)) + labs(x = "Days Between Registration and Certification") +
ggtitle("OM Projects") + theme(plot.title = element_text(hjust = 0.5))
#### Gross floor area vs. submittal time tests
# I have scaled the y axis to exclude some outliers in favor of seeing the main data set
# more clearly
ggplot(completed_projects, aes(y = GrossSqFoot, x = SubmittalTime, color = RatingSystemFamily)) +
geom_point() + scale_y_continuous(limits = c(0, 5e+06)) + labs(x = "Days Between Registration and Certification",
y = "Gross Floor Area") + ggtitle("Project Age vs Total Area") + theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 14 rows containing missing values (geom_point).
# When a facet wrap is applied, it is a little more clear there may be some association
# with at least one rating system.
ggplot(completed_projects, aes(y = GrossSqFoot, x = SubmittalTime)) + geom_point() + facet_wrap(~RatingSystemFamily) +
scale_y_continuous(limits = c(0, 5e+06)) + labs(x = "Days Between Registration and Certification",
y = "Gross Floor Area") + ggtitle("Project Age vs Total Area By Rating System Family") +
theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 14 rows containing missing values (geom_point).
# This is a test to see if project timelines decrease as the time between the registration
# date and the launch date increases Create a data frame of completed v2 projects that are
# at least as old as the v3 launch. This is so that we know we aren't creating a bias with
# recently registered projects that certified quickly.
completed_projects_v2 <- subset(completed_projects, completed_projects$Platform == "v2" & project_data_adjusted$RegYear >
0)
## Warning in completed_projects$Platform == "v2" & project_data_adjusted
## $RegYear > : longer object length is not a multiple of shorter object
## length
table(completed_projects_v2$RegYear)
##
## -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12
## 2 6 42 1690 3044 3361 3890 592 24 21 73 36 26 21 6
completed_projects_v2 <- subset(completed_projects_v2, completed_projects_v2$RegYear < 5 &
completed_projects_v2$RegYear > 0)
ggplot(completed_projects_v2, aes(y = SubmittalTime, x = RatingSystemFamily)) + geom_boxplot() +
facet_wrap(~RegYear) + labs(x = "Rating Sytem Family", y = "Days Between Registration and Certification") +
ggtitle("v2 Projects By Number of Years Registered after Platform Launch") + theme(plot.title = element_text(hjust = 0.5))
completed_projects_v3 <- subset(completed_projects, completed_projects$Platform == "v3")
table(completed_projects_v3$RegYear)
##
## 1 2 3 4 5 6 7 8
## 2283 3430 3569 3240 2694 2075 1352 622
completed_projects_v3 <- subset(completed_projects_v3, completed_projects_v3$RegYear < 5 &
completed_projects_v3$RegYear > 0)
ggplot(completed_projects_v3, aes(y = SubmittalTime, x = RatingSystemFamily)) + geom_boxplot() +
facet_wrap(~RegYear) + labs(x = "Rating Sytem Family", y = "Days Between Registration and Certification") +
ggtitle("v3 Projects By Number of Years Registered after Platform Launch") + theme(plot.title = element_text(hjust = 0.5))
Additional graphs that are outside of the current project scope but may be useful.
# Owner type table
table(completed_projects$OwnerTypesAdj)
##
## confidential corporate education government individual
## 4846 9060 1811 5875 672
## investor non-profit other profit
## 2135 2345 889 5190
# Graph by owner types
ggplot(completed_projects, aes(x = SubmittalTime, fill = RatingSystemFamily)) + geom_histogram(binwidth = 100,
alpha = 0.5, position = "identity", aes(y = ..density..)) + facet_wrap(~OwnerTypesAdj) +
labs(x = "Days Between Registration and Certification") + ggtitle("Owner Type Breakdown") +
theme(plot.title = element_text(hjust = 0.5))
# test for likelyhood of certification
BDC_sub <- subset(project_data_adjusted, project_data_adjusted$RatingSystemFamily == "BDC")
IDC_sub <- subset(project_data_adjusted, project_data_adjusted$RatingSystemFamily == "IDC")
OM_sub <- subset(project_data_adjusted, project_data_adjusted$RatingSystemFamily == "OM")
maxv3years <- max(trunc(project_data_adjusted$SubmittalTime/365) + 1, na.rm = TRUE)
v2_years <- rep(NA, maxv3years)
v2_years_c <- rep(NA, maxv3years)
v2_BDC_years <- rep(NA, maxv3years)
v2_BDC_years_c <- rep(NA, maxv3years)
v2_IDC_years <- rep(NA, maxv3years)
v2_IDC_years_c <- rep(NA, maxv3years)
v2_OM_years <- rep(NA, maxv3years)
v2_OM_years_c <- rep(NA, maxv3years)
v3_years <- rep(NA, maxv3years)
v3_years_c <- rep(NA, maxv3years)
v3_BDC_years <- rep(NA, maxv3years)
v3_BDC_years_c <- rep(NA, maxv3years)
v3_IDC_years <- rep(NA, maxv3years)
v3_IDC_years_c <- rep(NA, maxv3years)
v3_OM_years <- rep(NA, maxv3years)
v3_OM_years_c <- rep(NA, maxv3years)
n <- maxv3years
for (i in 1:maxv3years) {
v2_years[i] <- nrow(subset(project_data_adjusted, project_data_adjusted$Platform == "v2" &
trunc(project_data_adjusted$SubmittalTime/365 + 1) == i))/nrow(subset(project_data_adjusted,
project_data_adjusted$Platform == "v2"))
v2_years_c[i] <- nrow(subset(project_data_adjusted, project_data_adjusted$Platform == "v2" &
trunc(project_data_adjusted$SubmittalTime/365 + 1) <= i))/nrow(subset(project_data_adjusted,
project_data_adjusted$Platform == "v2"))
v2_BDC_years[i] <- nrow(subset(BDC_sub, BDC_sub$Platform == "v2" & trunc(BDC_sub$SubmittalTime/365 +
1) == i))/nrow(subset(BDC_sub, BDC_sub$Platform == "v2"))
v2_BDC_years_c[i] <- nrow(subset(BDC_sub, BDC_sub$Platform == "v2" & trunc(BDC_sub$SubmittalTime/365 +
1) <= i))/nrow(subset(BDC_sub, BDC_sub$Platform == "v2"))
v2_IDC_years[i] <- nrow(subset(IDC_sub, IDC_sub$Platform == "v2" & trunc(IDC_sub$SubmittalTime/365 +
1) == i))/nrow(subset(IDC_sub, IDC_sub$Platform == "v2"))
v2_IDC_years_c[i] <- nrow(subset(IDC_sub, IDC_sub$Platform == "v2" & trunc(IDC_sub$SubmittalTime/365 +
1) <= i))/nrow(subset(IDC_sub, IDC_sub$Platform == "v2"))
v2_OM_years[i] <- nrow(subset(OM_sub, OM_sub$Platform == "v2" & trunc(OM_sub$SubmittalTime/365 +
1) == i))/nrow(subset(OM_sub, OM_sub$Platform == "v2"))
v2_OM_years_c[i] <- nrow(subset(OM_sub, OM_sub$Platform == "v2" & trunc(OM_sub$SubmittalTime/365 +
1) <= i))/nrow(subset(OM_sub, OM_sub$Platform == "v2"))
v3_years[i] <- nrow(subset(project_data_adjusted, project_data_adjusted$Platform == "v3" &
trunc(project_data_adjusted$SubmittalTime/365 + 1) == i))/nrow(subset(project_data_adjusted,
project_data_adjusted$Platform == "v3"))
v3_years_c[i] <- nrow(subset(project_data_adjusted, project_data_adjusted$Platform == "v3" &
trunc(project_data_adjusted$SubmittalTime/365 + 1) <= i))/nrow(subset(project_data_adjusted,
project_data_adjusted$Platform == "v3"))
v3_BDC_years[i] <- nrow(subset(BDC_sub, BDC_sub$Platform == "v3" & trunc(BDC_sub$SubmittalTime/365 +
1) == i))/nrow(subset(BDC_sub, BDC_sub$Platform == "v3"))
v3_BDC_years_c[i] <- nrow(subset(BDC_sub, BDC_sub$Platform == "v3" & trunc(BDC_sub$SubmittalTime/365 +
1) <= i))/nrow(subset(BDC_sub, BDC_sub$Platform == "v3"))
v3_IDC_years[i] <- nrow(subset(IDC_sub, IDC_sub$Platform == "v3" & trunc(IDC_sub$SubmittalTime/365 +
1) == i))/nrow(subset(IDC_sub, IDC_sub$Platform == "v3"))
v3_IDC_years_c[i] <- nrow(subset(IDC_sub, IDC_sub$Platform == "v3" & trunc(IDC_sub$SubmittalTime/365 +
1) <= i))/nrow(subset(IDC_sub, IDC_sub$Platform == "v3"))
v3_OM_years[i] <- nrow(subset(OM_sub, OM_sub$Platform == "v3" & trunc(OM_sub$SubmittalTime/365 +
1) == i))/nrow(subset(OM_sub, OM_sub$Platform == "v3"))
v3_OM_years_c[i] <- nrow(subset(OM_sub, OM_sub$Platform == "v3" & trunc(OM_sub$SubmittalTime/365 +
1) <= i))/nrow(subset(OM_sub, OM_sub$Platform == "v3"))
}
certification_ratio_df <- data.frame(c(1:maxv3years))
names(certification_ratio_df) <- c("years")
certification_ratio_df$v2 <- v2_years
certification_ratio_df$v3 <- v3_years
certification_ratio_df$v2_BDC <- v2_BDC_years
certification_ratio_df$v2_IDC <- v2_IDC_years
certification_ratio_df$v2_OM <- v2_OM_years
certification_ratio_df$v3_BDC <- v3_BDC_years
certification_ratio_df$v3_IDC <- v3_IDC_years
certification_ratio_df$v3_OM <- v3_OM_years
certification_ratio_df_c <- data.frame(c(1:maxv3years))
names(certification_ratio_df_c) <- c("years")
certification_ratio_df_c$v2 <- v2_years_c
certification_ratio_df_c$v3 <- v3_years_c
certification_ratio_df_c$v2_BDC <- v2_BDC_years_c
certification_ratio_df_c$v2_IDC <- v2_IDC_years_c
certification_ratio_df_c$v2_OM <- v2_OM_years_c
certification_ratio_df_c$v3_BDC <- v3_BDC_years_c
certification_ratio_df_c$v3_IDC <- v3_IDC_years_c
certification_ratio_df_c$v3_OM <- v3_OM_years_c
require(reshape2)
## Loading required package: reshape2
certification_ratio_df_reshape <- melt(certification_ratio_df[, 1:9], id.vars = "years")
certification_ratio_df_reshape_c <- melt(certification_ratio_df_c[, 1:9], id.vars = "years")
certification_ratio_df_reshape$Platform <- sapply(certification_ratio_df_reshape$variable,
function(x) {
x <- ifelse(grepl("v2", x), "v2", "v3")
})
certification_ratio_df_reshape_c$Platform <- sapply(certification_ratio_df_reshape_c$variable,
function(x) {
x <- ifelse(grepl("v2", x), "v2", "v3")
})
ggplot(certification_ratio_df_reshape, aes(y = value, x = years, fill = variable)) + geom_bar(stat = "identity",
position = "dodge") + facet_wrap(~Platform) + labs(x = "Years", y = "Percent of Certified Projects By Year") +
ggtitle("Project Certifications By Year After Registered") + theme(plot.title = element_text(hjust = 0.5)) +
scale_x_continuous(breaks = c(1:8))
ggplot(certification_ratio_df_reshape_c, aes(y = value, x = years, fill = variable)) + geom_bar(stat = "identity",
position = "dodge") + facet_wrap(~Platform) + labs(x = "Years", y = "Percent of Certified Projects By Year Cumulatively") +
ggtitle("Project Certifications By Year After Registered - Cumulative") + theme(plot.title = element_text(hjust = 0.5)) +
scale_x_continuous(breaks = c(1:8))