Assignment 2

#install.packages("car")
library(Rmisc)

## Loading required package: lattice

## Loading required package: plyr

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange()   masks plyr::arrange()
## ✖ purrr::compact()   masks plyr::compact()
## ✖ dplyr::count()     masks plyr::count()
## ✖ dplyr::desc()      masks plyr::desc()
## ✖ dplyr::failwith()  masks plyr::failwith()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::id()        masks plyr::id()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::mutate()    masks plyr::mutate()
## ✖ dplyr::rename()    masks plyr::rename()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(haven)
library(readr)
library(pander)
library(dplyr)
library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

##Import the data “2017_Building_Energy_Benchmarking.csv” and complete the following tasks

# Read the 2017 building energy benchmarking CSV with readr (returns a tibble).
data <- read_csv(file = "./data/2017_Building_Energy_Benchmarking.csv")

## Rows: 3461 Columns: 45
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): BuildingType, PrimaryPropertyType, PropertyName, Address, City, St...
## dbl (30): OSEBuildingID, DataYear, ZipCode, CouncilDistrictCode, Latitude, L...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Use pipes from the tidyverse package to subset the data. Discard rows whose Outlier value is not NA. Then select ENERGYSTARScore, SiteEUIWN(kBtu/sf), PrimaryPropertyType, YearBuilt, and GHGEmissionsIntensity (do not select Outlier). Also, omit rows that contain NA values. Finally, name this subset seattle_data_subset. Hint: try using filter() to keep only the rows where Outlier is NA.

# Keep only rows that were not flagged as outliers (Outlier is NA), retain the
# five analysis columns, then discard any row that still has a missing value.
seattle_data_subset <- data %>%
  filter(is.na(Outlier)) %>%
  select(all_of(c(
    "ENERGYSTARScore",
    "SiteEUIWN(kBtu/sf)",
    "PrimaryPropertyType",
    "YearBuilt",
    "GHGEmissionsIntensity"
  ))) %>%
  drop_na()

#Draw a scatterplot between ENERGYSTARScore and SiteEUIWN(kBtu/sf) using the car package. Set regLine = FALSE and smooth = FALSE. Hint: How do you input SiteEUIWN(kBtu/sf)?

# car scatterplot of ENERGYSTARScore against the SiteEUIWN(kBtu/sf) column,
# with the regression line and smoother turned off. The backticks are needed
# because the column name contains parentheses and a slash.
scatterplot(
  ENERGYSTARScore ~ `SiteEUIWN(kBtu/sf)`,
  data = seattle_data_subset,
  smooth = FALSE,
  regLine = FALSE
)

#Draw a scatterplot between ENERGYSTARScore and GHGEmissionsIntensity using the car package.

# car scatterplot of ENERGYSTARScore against GHGEmissionsIntensity, with the
# regression line and smoother turned off.
scatterplot(
  ENERGYSTARScore ~ GHGEmissionsIntensity,
  data = seattle_data_subset,
  smooth = FALSE,
  regLine = FALSE
)

#Use ifelse() statement to collapse ENERGYSTARScore to an ordinal variable. ENERGYSTARScore below 60 should be recoded as “LOW”, ENERGYSTARScore over or equal to 60 but below 90 should be recoded as “MEDIUM”, ENERGYSTARScore above or equal to 90 should be recoded as “HIGH”. Create a new column named Energyscore_cat in the seattle_data_subset to save the recoded results.

#Coerce Energyscore_cat to a factor, and order the levels as “LOW”, “MEDIUM”, “HIGH”.

# Collapse ENERGYSTARScore into three bands and store them as an ordered
# factor (LOW < MEDIUM < HIGH):
#   score >= 90        -> "HIGH"
#   60 <= score < 90   -> "MEDIUM"
#   score < 60         -> "LOW"
seattle_data_subset <- seattle_data_subset %>%
  mutate(
    # Step 1: recode the numeric score to a character label.
    Energyscore_cat = ifelse(
      ENERGYSTARScore >= 90, "HIGH",
      ifelse(ENERGYSTARScore >= 60, "MEDIUM", "LOW")
    ),
    # Step 2: convert the labels to an ordered factor with explicit levels.
    Energyscore_cat = factor(
      Energyscore_cat,
      levels = c("LOW", "MEDIUM", "HIGH"),
      ordered = TRUE
    )
  )

#Calculate the percentage of each Energyscore_cat level. Round the results to 2 decimal places.

# Tally the rows in each Energyscore_cat level and express each tally as a
# percentage of all rows, rounded to two decimal places.
score_percentage <- seattle_data_subset %>%
  count(Energyscore_cat, name = "n") %>%
  mutate(Percentage = round(prop.table(n) * 100, digits = 2))

#Draw a pie chart to visualize the distribution of Energyscore_cat. Include labels and a main title.

# Pie chart of the percentage of rows in each Energyscore_cat level.
# Each slice is labelled "<level>: <percentage>%". paste0() is used so the
# label has no stray space before the colon or the percent sign, which the
# default separator of paste() would insert.
pie(
  score_percentage$Percentage,
  labels = paste0(
    score_percentage$Energyscore_cat, ": ", score_percentage$Percentage, "%"
  ),
  main = "ENERGYSTARScore Category Distribution Percentages"
)

##Import our textbook dataset “UZA.csv” and complete the following tasks.

# Read the textbook UZA dataset with base R (returns a data.frame).
uza_data <- read.csv(file = "./data/UZA.csv")

#Use ggplot for this question. Draw a histogram of the pop000. Set the number of bins to 10. Choose any color and any title as you like.

# Histogram of pop000 using 10 bins; purple bars with a gold outline.
ggplot(data = uza_data, mapping = aes(x = pop000)) +
  geom_histogram(bins = 10, color = "gold", fill = "purple") +
  labs(
    x = "Population in Thousands (000s)",
    y = "Count",
    title = "Histogram of Population"
  )

#Use ggplot for this question. Draw a scatterplot between tfreq and compact, and put tfreq on y-axis. Choose any color and any title as you like.

# Scatterplot with compact on the x-axis and tfreq on the y-axis.
ggplot(data = uza_data, mapping = aes(x = compact, y = tfreq)) +
  geom_point(color = "purple") +
  labs(
    x = "Compactness",
    y = "Transit Frequency",
    title = "Transit Frequency & Compactness"
  )

Assignment 2

Mitch Roers

2024-10-29