Abstract : The purpose of this study is to explore, analyze, and discuss findings from the dataset “Apartment Building Registration” found in opendatatoronto. By first analyzing two variables individually, year_of_built and available_amenities_list, we then compare our variables’ analyses to identify if there is a relationship of amenity offerings in apartment buildings around Toronto, particularly over recent years.
License: MIT License. \(~\)
knitr::opts_chunk$set(echo = TRUE)
rm(list=ls())
\(~\)
library(rvest)
## Loading required package: xml2
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.4
## ✓ tibble 3.0.1 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.3 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag() masks stats::lag()
## x purrr::pluck() masks rvest::pluck()
library(opendatatoronto)
library(dplyr)
library(readr)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(tidyr)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
library(formattable)
library(wordcloud2)
library(knitr)
Purpose: This script gets data from opentorontodata directly (CRAN.Finding) and downloading data using opendatatoronto
library(opendatatoronto)
library(dplyr)
\(~\)
Apartment_packages <- search_packages("Apartment")
\(~\)
Apartment_packages
## # A tibble: 2 x 10
## title id topics civic_issues excerpt dataset_category num_resources formats
## <chr> <chr> <chr> <chr> <chr> <chr> <int> <chr>
## 1 Apar… 4ef8… Locat… Affordable … "This … Table 1 XML,JS…
## 2 Apar… 2b98… Busin… Affordable … "This … Table 1 XML,JS…
## # … with 2 more variables: refresh_rate <chr>, last_refreshed <date>
\(~\)
Apartment_packages$title
## [1] "Apartment Building Evaluation" "Apartment Building Registration"
\(~\)
Apartment_Building_Registration <- Apartment_packages %>%
filter(title == "Apartment Building Registration")
\(~\)
Apartment_Building_Registration %>%
glimpse()
## Rows: 1
## Columns: 10
## $ title <chr> "Apartment Building Registration"
## $ id <chr> "2b98b3f3-4f3a-42a4-a4e9-b44d3026595a"
## $ topics <chr> "Business,City government,Development and infrastruc…
## $ civic_issues <chr> "Affordable housing,Poverty reduction"
## $ excerpt <chr> "This dataset contains building information for rent…
## $ dataset_category <chr> "Table"
## $ num_resources <int> 1
## $ formats <chr> "XML,JSON,CSV"
## $ refresh_rate <chr> "Monthly"
## $ last_refreshed <date> 2020-05-06
\(~\)
library(readr)
Apartment_Building_Registration_Data <- read_csv("~/Desktop/INF2178/Problem Set 2/input/Apartment Building Registration Data.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## `_id` = col_double(),
## CONFIRMED_STOREYS = col_double(),
## CONFIRMED_UNITS = col_double(),
## HEATING_EQUIPMENT_YEAR_INSTALLED = col_double(),
## NO_BARRIER_FREE_ACCESSBLE_UNITS = col_double(),
## NO_OF_ACCESSIBLE_PARKING_SPACES = col_double(),
## NO_OF_ELEVATORS = col_double(),
## NO_OF_LAUNDRY_ROOM_MACHINES = col_double(),
## RSN = col_double(),
## SPRINKLER_SYSTEM_YEAR_INSTALLED = col_double(),
## YEAR_BUILT = col_double(),
## YEAR_OF_REPLACEMENT = col_double(),
## YEAR_REGISTERED = col_double()
## )
## See spec(...) for full column specifications.
View(Apartment_Building_Registration_Data)
Purpose: This script cleans data from opentorontodata directly (CRAN.Finding) and downloading data using opendatatoronto
Apartment_Building_Registration %>%
glimpse()
## Rows: 1
## Columns: 10
## $ title <chr> "Apartment Building Registration"
## $ id <chr> "2b98b3f3-4f3a-42a4-a4e9-b44d3026595a"
## $ topics <chr> "Business,City government,Development and infrastruc…
## $ civic_issues <chr> "Affordable housing,Poverty reduction"
## $ excerpt <chr> "This dataset contains building information for rent…
## $ dataset_category <chr> "Table"
## $ num_resources <int> 1
## $ formats <chr> "XML,JSON,CSV"
## $ refresh_rate <chr> "Monthly"
## $ last_refreshed <date> 2020-05-06
\(~\) #### Remove na in r - remove rows - na.omit function / option
Data <-Apartment_Building_Registration_Data[-which(is.na(Apartment_Building_Registration_Data$YEAR_BUILT)),]
\(~\)
sum(is.na(Apartment_Building_Registration_Data$YEAR_BUILT))
## [1] 12
Data #Apartment_Building_Registration_Data without NA in year built
## # A tibble: 3,438 x 62
## `_id` AIR_CONDITIONIN… AMENITIES_AVAIL… ANNUAL_FIRE_ALA… ANNUAL_FIRE_PUM…
## <dbl> <chr> <chr> <chr> <chr>
## 1 1 NONE Indoor recreati… YES YES
## 2 2 NONE <NA> YES NO
## 3 3 NONE <NA> YES YES
## 4 4 NONE Indoor exercise… YES YES
## 5 5 NONE Outdoor pool , … YES YES
## 6 6 CENTRAL AIR Outdoor pool YES YES
## 7 7 NONE <NA> YES YES
## 8 8 NONE Outdoor rec fac… YES YES
## 9 9 NONE <NA> YES YES
## 10 10 NONE Outdoor pool YES YES
## # … with 3,428 more rows, and 57 more variables:
## # APPROVED_FIRE_SAFETY_PLAN <chr>, BALCONIES <chr>,
## # BARRIER_FREE_ACCESSIBILTY_ENTR <chr>, BIKE_PARKING <chr>,
## # CONFIRMED_STOREYS <dbl>, CONFIRMED_UNITS <dbl>,
## # DATE_OF_LAST_INSPECTION_BY_TSSA <chr>,
## # DESCRIPTION_OF_CHILD_PLAY_AREA <chr>,
## # DESCRIPTION_OF_INDOOR_EXERCISE_ROOM <chr>,
## # DESCRIPTION_OF_OUTDOOR_REC_FACILITIES <chr>, ELEVATOR_PARTS_REPLACED <chr>,
## # ELEVATOR_STATUS <chr>, EMERG_POWER_SUPPLY_TEST_RECORDS <chr>,
## # EXTERIOR_FIRE_ESCAPE <chr>, FACILITIES_AVAILABLE <chr>, FIRE_ALARM <chr>,
## # GARBAGE_CHUTES <chr>, GREEN_BIN_LOCATION <chr>,
## # HEATING_EQUIPMENT_STATUS <chr>, HEATING_EQUIPMENT_YEAR_INSTALLED <dbl>,
## # HEATING_TYPE <chr>, INDOOR_GARBAGE_STORAGE_AREA <chr>, INTERCOM <chr>,
## # IS_THERE_A_COOLING_ROOM <chr>, IS_THERE_EMERGENCY_POWER <chr>,
## # LAUNDRY_ROOM <chr>, LAUNDRY_ROOM_HOURS_OF_OPERATION <chr>,
## # LAUNDRY_ROOM_LOCATION <chr>, LOCKER_OR_STORAGE_ROOM <chr>,
## # NO_BARRIER_FREE_ACCESSBLE_UNITS <dbl>,
## # NO_OF_ACCESSIBLE_PARKING_SPACES <dbl>, NO_OF_ELEVATORS <dbl>,
## # NO_OF_LAUNDRY_ROOM_MACHINES <dbl>, NON_SMOKING_BUILDING <chr>,
## # OUTDOOR_GARBAGE_STORAGE_AREA <chr>, PARKING_TYPE <chr>, PCODE <chr>,
## # PET_RESTRICTIONS <chr>, PETS_ALLOWED <chr>,
## # PROP_MANAGEMENT_COMPANY_NAME <chr>, PROPERTY_TYPE <chr>,
## # RECYCLING_BINS_LOCATION <chr>, RSN <dbl>,
## # SEPARATE_GAS_METERS_EACH_UNIT <chr>, SEPARATE_HYDRO_METER_EACH_UNIT <chr>,
## # SEPARATE_WATER_METERS_EA_UNIT <chr>, SITE_ADDRESS <chr>,
## # SPRINKLER_SYSTEM <chr>, SPRINKLER_SYSTEM_TEST_RECORD <chr>,
## # SPRINKLER_SYSTEM_YEAR_INSTALLED <dbl>, TSSA_TEST_RECORDS <chr>,
## # VISITOR_PARKING <chr>, WARD <chr>, WINDOW_TYPE <chr>, YEAR_BUILT <dbl>,
## # YEAR_OF_REPLACEMENT <dbl>, YEAR_REGISTERED <dbl>
range(Data$YEAR_BUILT)
## [1] 1805 2019
\(~\)
Year <- table(Data$YEAR_BUILT)
duration <- Data$YEAR_BUILT
range(duration) ## check the range
## [1] 1805 2019
breaks <- seq(1800, 2020, by=10) ##create intervals of 10 years from the min to max values
duration.cut <- cut(duration, breaks, right=FALSE)
#options(duration.cut,scipen = 999)#Modify R Options to Disable Scientific Notation
duration.freq <- table(duration.cut) ## frequency of apartments builts in each sub interval
duration.freq
## duration.cut
## [1.8e+03,1.81e+03) [1.81e+03,1.82e+03) [1.82e+03,1.83e+03) [1.83e+03,1.84e+03)
## 2 0 0 1
## [1.84e+03,1.85e+03) [1.85e+03,1.86e+03) [1.86e+03,1.87e+03) [1.87e+03,1.88e+03)
## 0 0 0 0
## [1.88e+03,1.89e+03) [1.89e+03,1.9e+03) [1.9e+03,1.91e+03) [1.91e+03,1.92e+03)
## 4 6 23 55
## [1.92e+03,1.93e+03) [1.93e+03,1.94e+03) [1.94e+03,1.95e+03) [1.95e+03,1.96e+03)
## 134 101 94 904
## [1.96e+03,1.97e+03) [1.97e+03,1.98e+03) [1.98e+03,1.99e+03) [1.99e+03,2e+03)
## 1193 502 158 151
## [2e+03,2.01e+03) [2.01e+03,2.02e+03)
## 53 57
duration.freq <- cbind.data.frame(duration.freq) ## to have it in matrix format
duration.freq
## duration.cut Freq
## 1 [1.8e+03,1.81e+03) 2
## 2 [1.81e+03,1.82e+03) 0
## 3 [1.82e+03,1.83e+03) 0
## 4 [1.83e+03,1.84e+03) 1
## 5 [1.84e+03,1.85e+03) 0
## 6 [1.85e+03,1.86e+03) 0
## 7 [1.86e+03,1.87e+03) 0
## 8 [1.87e+03,1.88e+03) 0
## 9 [1.88e+03,1.89e+03) 4
## 10 [1.89e+03,1.9e+03) 6
## 11 [1.9e+03,1.91e+03) 23
## 12 [1.91e+03,1.92e+03) 55
## 13 [1.92e+03,1.93e+03) 134
## 14 [1.93e+03,1.94e+03) 101
## 15 [1.94e+03,1.95e+03) 94
## 16 [1.95e+03,1.96e+03) 904
## 17 [1.96e+03,1.97e+03) 1193
## 18 [1.97e+03,1.98e+03) 502
## 19 [1.98e+03,1.99e+03) 158
## 20 [1.99e+03,2e+03) 151
## 21 [2e+03,2.01e+03) 53
## 22 [2.01e+03,2.02e+03) 57
\(~\)
duration <- Data$YEAR_BUILT
breaks <- seq(1800, 2020, by=10) ##create intervals of 10 years from the min to max values
breaks
## [1] 1800 1810 1820 1830 1840 1850 1860 1870 1880 1890 1900 1910 1920 1930 1940
## [16] 1950 1960 1970 1980 1990 2000 2010 2020
duration.cut <- cut(duration, breaks, right=FALSE)
#duration.cut
duration.freq <- table(duration.cut) ##notice duration.freq is a table not dataframe##
duration.freq
## duration.cut
## [1.8e+03,1.81e+03) [1.81e+03,1.82e+03) [1.82e+03,1.83e+03) [1.83e+03,1.84e+03)
## 2 0 0 1
## [1.84e+03,1.85e+03) [1.85e+03,1.86e+03) [1.86e+03,1.87e+03) [1.87e+03,1.88e+03)
## 0 0 0 0
## [1.88e+03,1.89e+03) [1.89e+03,1.9e+03) [1.9e+03,1.91e+03) [1.91e+03,1.92e+03)
## 4 6 23 55
## [1.92e+03,1.93e+03) [1.93e+03,1.94e+03) [1.94e+03,1.95e+03) [1.95e+03,1.96e+03)
## 134 101 94 904
## [1.96e+03,1.97e+03) [1.97e+03,1.98e+03) [1.98e+03,1.99e+03) [1.99e+03,2e+03)
## 1193 502 158 151
## [2e+03,2.01e+03) [2.01e+03,2.02e+03)
## 53 57
duration.freq <- cbind.data.frame(duration.freq) ##to have the results in column format##
duration.freq
## duration.cut Freq
## 1 [1.8e+03,1.81e+03) 2
## 2 [1.81e+03,1.82e+03) 0
## 3 [1.82e+03,1.83e+03) 0
## 4 [1.83e+03,1.84e+03) 1
## 5 [1.84e+03,1.85e+03) 0
## 6 [1.85e+03,1.86e+03) 0
## 7 [1.86e+03,1.87e+03) 0
## 8 [1.87e+03,1.88e+03) 0
## 9 [1.88e+03,1.89e+03) 4
## 10 [1.89e+03,1.9e+03) 6
## 11 [1.9e+03,1.91e+03) 23
## 12 [1.91e+03,1.92e+03) 55
## 13 [1.92e+03,1.93e+03) 134
## 14 [1.93e+03,1.94e+03) 101
## 15 [1.94e+03,1.95e+03) 94
## 16 [1.95e+03,1.96e+03) 904
## 17 [1.96e+03,1.97e+03) 1193
## 18 [1.97e+03,1.98e+03) 502
## 19 [1.98e+03,1.99e+03) 158
## 20 [1.99e+03,2e+03) 151
## 21 [2e+03,2.01e+03) 53
## 22 [2.01e+03,2.02e+03) 57
\(~\)
Data_Study <- subset(Data, select = c("YEAR_BUILT", "AMENITIES_AVAILABLE"))
Data_Study
## # A tibble: 3,438 x 2
## YEAR_BUILT AMENITIES_AVAILABLE
## <dbl> <chr>
## 1 1992 Indoor recreation room
## 2 1951 <NA>
## 3 1969 <NA>
## 4 1979 Indoor exercise room , Child play area
## 5 1970 Outdoor pool , Child play area
## 6 1974 Outdoor pool
## 7 1957 <NA>
## 8 1970 Outdoor rec facilities , Child play area
## 9 1970 <NA>
## 10 1972 Outdoor pool
## # … with 3,428 more rows
\(~\)
library(janitor)
library(tidyr)
amenities_available_list <- Data %>%
clean_names() %>%
select(id, amenities_available) %>%
mutate(n_buildings = n_distinct(id)) %>%
separate_rows(amenities_available, sep = " , ")
amenities_available_list #list of amenities in all apartments
## # A tibble: 4,189 x 3
## id amenities_available n_buildings
## <dbl> <chr> <int>
## 1 1 Indoor recreation room 3438
## 2 2 <NA> 3438
## 3 3 <NA> 3438
## 4 4 Indoor exercise room 3438
## 5 4 Child play area 3438
## 6 5 Outdoor pool 3438
## 7 5 Child play area 3438
## 8 6 Outdoor pool 3438
## 9 7 <NA> 3438
## 10 8 Outdoor rec facilities 3438
## # … with 4,179 more rows
#Change NA to None for amenities#
amenities_available_list$amenities_available[is.na(amenities_available_list$amenities_available)] <- "None"
amenities_available_list
## # A tibble: 4,189 x 3
## id amenities_available n_buildings
## <dbl> <chr> <int>
## 1 1 Indoor recreation room 3438
## 2 2 None 3438
## 3 3 None 3438
## 4 4 Indoor exercise room 3438
## 5 4 Child play area 3438
## 6 5 Outdoor pool 3438
## 7 5 Child play area 3438
## 8 6 Outdoor pool 3438
## 9 7 None 3438
## 10 8 Outdoor rec facilities 3438
## # … with 4,179 more rows
#### Save the data ####
#write_csv(amenities_available_list, "Amenities_available_list.csv") #data for amenties availibale list
#write_csv(Data,"Data.csv") #data without na in year built
#write_csv(Data_Study,"Data_Study.csv") #data with only years and amenties
Purpose: This script conducts exploratory data analysis of data that was downloaded in 01-get_data.R and cleaned in 02_clean_data.R
duration.freq$Percentage= 100*(duration.freq$Freq/sum(duration.freq$Fre))
duration.freq
## duration.cut Freq Percentage
## 1 [1.8e+03,1.81e+03) 2 0.05817336
## 2 [1.81e+03,1.82e+03) 0 0.00000000
## 3 [1.82e+03,1.83e+03) 0 0.00000000
## 4 [1.83e+03,1.84e+03) 1 0.02908668
## 5 [1.84e+03,1.85e+03) 0 0.00000000
## 6 [1.85e+03,1.86e+03) 0 0.00000000
## 7 [1.86e+03,1.87e+03) 0 0.00000000
## 8 [1.87e+03,1.88e+03) 0 0.00000000
## 9 [1.88e+03,1.89e+03) 4 0.11634671
## 10 [1.89e+03,1.9e+03) 6 0.17452007
## 11 [1.9e+03,1.91e+03) 23 0.66899360
## 12 [1.91e+03,1.92e+03) 55 1.59976731
## 13 [1.92e+03,1.93e+03) 134 3.89761489
## 14 [1.93e+03,1.94e+03) 101 2.93775451
## 15 [1.94e+03,1.95e+03) 94 2.73414776
## 16 [1.95e+03,1.96e+03) 904 26.29435718
## 17 [1.96e+03,1.97e+03) 1193 34.70040721
## 18 [1.97e+03,1.98e+03) 502 14.60151251
## 19 [1.98e+03,1.99e+03) 158 4.59569517
## 20 [1.99e+03,2e+03) 151 4.39208842
## 21 [2e+03,2.01e+03) 53 1.54159395
## 22 [2.01e+03,2.02e+03) 57 1.65794066
#Cumulative percentage
duration.freq$Cumulative_Percentage= 100*cumsum(duration.freq$Freq)/sum(duration.freq$Freq)
duration.freq
## duration.cut Freq Percentage Cumulative_Percentage
## 1 [1.8e+03,1.81e+03) 2 0.05817336 0.05817336
## 2 [1.81e+03,1.82e+03) 0 0.00000000 0.05817336
## 3 [1.82e+03,1.83e+03) 0 0.00000000 0.05817336
## 4 [1.83e+03,1.84e+03) 1 0.02908668 0.08726003
## 5 [1.84e+03,1.85e+03) 0 0.00000000 0.08726003
## 6 [1.85e+03,1.86e+03) 0 0.00000000 0.08726003
## 7 [1.86e+03,1.87e+03) 0 0.00000000 0.08726003
## 8 [1.87e+03,1.88e+03) 0 0.00000000 0.08726003
## 9 [1.88e+03,1.89e+03) 4 0.11634671 0.20360675
## 10 [1.89e+03,1.9e+03) 6 0.17452007 0.37812682
## 11 [1.9e+03,1.91e+03) 23 0.66899360 1.04712042
## 12 [1.91e+03,1.92e+03) 55 1.59976731 2.64688773
## 13 [1.92e+03,1.93e+03) 134 3.89761489 6.54450262
## 14 [1.93e+03,1.94e+03) 101 2.93775451 9.48225713
## 15 [1.94e+03,1.95e+03) 94 2.73414776 12.21640489
## 16 [1.95e+03,1.96e+03) 904 26.29435718 38.51076207
## 17 [1.96e+03,1.97e+03) 1193 34.70040721 73.21116928
## 18 [1.97e+03,1.98e+03) 502 14.60151251 87.81268179
## 19 [1.98e+03,1.99e+03) 158 4.59569517 92.40837696
## 20 [1.99e+03,2e+03) 151 4.39208842 96.80046539
## 21 [2e+03,2.01e+03) 53 1.54159395 98.34205934
## 22 [2.01e+03,2.02e+03) 57 1.65794066 100.00000000
#Change Names#
names(duration.freq)[1] <- "Years"
names(duration.freq)[2] <- "Apartments Built"
names(duration.freq)[4] <- "Cumulative Percentage"
duration.freq
## Years Apartments Built Percentage Cumulative Percentage
## 1 [1.8e+03,1.81e+03) 2 0.05817336 0.05817336
## 2 [1.81e+03,1.82e+03) 0 0.00000000 0.05817336
## 3 [1.82e+03,1.83e+03) 0 0.00000000 0.05817336
## 4 [1.83e+03,1.84e+03) 1 0.02908668 0.08726003
## 5 [1.84e+03,1.85e+03) 0 0.00000000 0.08726003
## 6 [1.85e+03,1.86e+03) 0 0.00000000 0.08726003
## 7 [1.86e+03,1.87e+03) 0 0.00000000 0.08726003
## 8 [1.87e+03,1.88e+03) 0 0.00000000 0.08726003
## 9 [1.88e+03,1.89e+03) 4 0.11634671 0.20360675
## 10 [1.89e+03,1.9e+03) 6 0.17452007 0.37812682
## 11 [1.9e+03,1.91e+03) 23 0.66899360 1.04712042
## 12 [1.91e+03,1.92e+03) 55 1.59976731 2.64688773
## 13 [1.92e+03,1.93e+03) 134 3.89761489 6.54450262
## 14 [1.93e+03,1.94e+03) 101 2.93775451 9.48225713
## 15 [1.94e+03,1.95e+03) 94 2.73414776 12.21640489
## 16 [1.95e+03,1.96e+03) 904 26.29435718 38.51076207
## 17 [1.96e+03,1.97e+03) 1193 34.70040721 73.21116928
## 18 [1.97e+03,1.98e+03) 502 14.60151251 87.81268179
## 19 [1.98e+03,1.99e+03) 158 4.59569517 92.40837696
## 20 [1.99e+03,2e+03) 151 4.39208842 96.80046539
## 21 [2e+03,2.01e+03) 53 1.54159395 98.34205934
## 22 [2.01e+03,2.02e+03) 57 1.65794066 100.00000000
#Modify global options in R
#options(scipen = 999)#
#ormat(duration.freq$year, scientific=F)#
\(~\)
library(janitor)
library(tidyr)
amenities <- amenities_available_list %>%
clean_names() %>%
select(id, amenities_available) %>%
mutate(n_buildings = n_distinct(id)) %>%
separate_rows(amenities_available, sep = " , ")
amenities
amenities <- amenities %>%
count(amenities_available, n_buildings) %>%
mutate(
amenities_available = ifelse(is.na(amenities_available), "None", amenities_available),
prop = n / n_buildings
)
amenities
### To sort the data frame in descending order by n###
amenities <-amenities[order(-amenities$n),]
amenities
#### Summary stats ####
skimr::skim(amenities)
summary(amenities)
Purpose: This script conducts exploratory data analysis of data that was downloaded in 01-get_data.R and cleaned in 02_clean_data.R.
\(~\)
x <- Data$YEAR_BUILT
h<-hist(x, breaks=10, col="Yellow", xlab="Duration Years from 1800 to 2020",
ylab="Apartments Built",
main="Apartments Built from 1800 to 2020")
xfit<-seq(min(x),max(x),length=40)
yfit<-dnorm(xfit,mean=mean(x),sd=sd(x))
yfit <- yfit*diff(h$mids[1:2])*length(x)
lines(xfit, yfit, col="Red", lwd=2)
\(~\)
library(formattable)
duration.freq$Percentage= 100*(duration.freq$Freq/sum(duration.freq$Fre))
duration.freq
#Cumulative percentage
duration.freq$Cumulative_Percentage= 100*cumsum(duration.freq$Freq)/sum(duration.freq$Freq)
duration.freq
#Change Names#
names(duration.freq)[1] <- "Years"
names(duration.freq)[2] <- "Apartments Built"
names(duration.freq)[4] <- "Cumulative Percentage"
duration.freq
#Modify global options in R
#options(scipen = 999)#
#ormat(duration.freq$year, scientific=F)#
###A fine table
formattable(duration.freq)
\(~\)
Amenities_Freq<-subset(amenities, select = c("amenities_available", "prop"))
Amenities_Freq
#Use wordcloud directly:
library(wordcloud2)
wordcloud2(Amenities_Freq,color = "random-dark", backgroundColor = "White")
\(~\)
library(ggplot2)
ggplot(amenities, aes(x = 2, y = prop, fill = amenities_available)) +
geom_bar(stat = "identity", color = "white") +
geom_text(aes(label = paste(round(prop * 100, 1), "%")),size=2, position = position_stack(vjust = 0.5)) +
coord_polar(theta = "y", start = 0)+
theme_void()+
xlim(0.5, 2.5)+
labs(
title = "Amenities Available in Toronto Apartment Buildings",
caption = "Source: Apartment Building Registration (Toronto Open Data)",
fill = "Amenities Available"
)
\(~\)
library(dplyr)
None_Amenity <- Data_Study[is.na(Data_Study$AMENITIES_AVAILABLE),]
None_Amenity
#Frequency table#
None_Amenitytable<-ftable(None_Amenity)
#Convert Tables to a Data Frame #
None_Amenity.df <- as.data.frame(None_Amenitytable)
#Change the factor to numeric#
None_Amenity.df$YEAR_BUILT=as.numeric(levels(None_Amenity.df$YEAR_BUILT))[None_Amenity.df$YEAR_BUILT]
#Plot#
None_Amenity_Plot=ggplot(None_Amenity.df, aes(x = YEAR_BUILT, y = Freq))+
geom_smooth(se = FALSE, method = "loess", color = "grey30") +
geom_point(alpha = 0.5, col = "cornflowerblue", size = 3)+
geom_line(alpha = 0.5,col = "red", size = 1) +
labs(title="None Amenity Available in Toronto Apartment Buildings",subtitle= "From 1800 to 2020", y="None Amenity Available", x = "Years ",caption = "Source: Apartment Building Registration (Toronto Open Data)")
None_Amenity_Plot + scale_x_continuous(breaks=seq(1800, 2020, 10))
\(~\)
library(dplyr)
Indoor_Recreation_Room <- filter(Data_Study,AMENITIES_AVAILABLE == "Indoor recreation room")
Indoor_Recreation_Room
## # A tibble: 251 x 2
## YEAR_BUILT AMENITIES_AVAILABLE
## <dbl> <chr>
## 1 1992 Indoor recreation room
## 2 1974 Indoor recreation room
## 3 1986 Indoor recreation room
## 4 2001 Indoor recreation room
## 5 1984 Indoor recreation room
## 6 1956 Indoor recreation room
## 7 1941 Indoor recreation room
## 8 1995 Indoor recreation room
## 9 1973 Indoor recreation room
## 10 1985 Indoor recreation room
## # … with 241 more rows
#Frequency table#
Indoor_Recreation_Roomtable<-ftable(Indoor_Recreation_Room)
#Convert Tables to a Data Frame #
Indoor_Recreation_Room.df <- as.data.frame(Indoor_Recreation_Roomtable)
#Change the factor to numeric#
Indoor_Recreation_Room.df$YEAR_BUILT=as.numeric(levels(Indoor_Recreation_Room.df$YEAR_BUILT))[Indoor_Recreation_Room.df$YEAR_BUILT]
#Plot#
Indoor_Recreation_Room_Plot=ggplot(Indoor_Recreation_Room.df, aes(x = YEAR_BUILT, y = Freq))+
geom_smooth(se = FALSE, method = "loess", color = "grey30") +
geom_point(alpha = 0.5, col = "cornflowerblue", size = 3)+
geom_line(alpha = 0.5,col = "red", size = 1) +
labs(title="Indoor Recreation Room Available in Toronto Apartment Buildings",subtitle= "From 1800 to 2020", y="Indoor Recreation Room Available", x = "Years ",caption = "Source: Apartment Building Registration (Toronto Open Data)")
Indoor_Recreation_Room_Plot + scale_x_continuous(breaks=seq(1800, 2020, 10))
## `geom_smooth()` using formula 'y ~ x'
\(~\)
library(dplyr)
Child_play_area <- filter(Data_Study,AMENITIES_AVAILABLE == "Child play area")
Child_play_area
## # A tibble: 92 x 2
## YEAR_BUILT AMENITIES_AVAILABLE
## <dbl> <chr>
## 1 1964 Child play area
## 2 1964 Child play area
## 3 1960 Child play area
## 4 1964 Child play area
## 5 1965 Child play area
## 6 1962 Child play area
## 7 1965 Child play area
## 8 1970 Child play area
## 9 1972 Child play area
## 10 1966 Child play area
## # … with 82 more rows
#Frequency table#
Childtable<-ftable(Child_play_area)
#Convert Tables to a Data Frame #
Child.df <- as.data.frame(Childtable)
#Change the factor to numeric#
Child.df$YEAR_BUILT=as.numeric(levels(Child.df$YEAR_BUILT))[Child.df$YEAR_BUILT]
#Plot#
Child_Plot=ggplot(Child.df, aes(x = YEAR_BUILT, y = Freq))+
geom_smooth(se = FALSE, method = "loess", color = "grey30") +
geom_point(alpha = 0.5, col = "cornflowerblue", size = 3)+
geom_line(alpha = 0.5,col = "red", size = 1) +
labs(title="Child Play Area Available in Toronto Apartment Buildings",subtitle= "From 1800 to 2020", y="Child Play Area Available", x = "Years ",caption = "Source: Apartment Building Registration (Toronto Open Data)")
Child_Plot + scale_x_continuous(breaks=seq(1800, 2020, 10))
## `geom_smooth()` using formula 'y ~ x'
\(~\)
library(dplyr)
Indoor_Exercise_Room <- filter(Data_Study,AMENITIES_AVAILABLE == "Indoor exercise room")
Indoor_Exercise_Room
## # A tibble: 35 x 2
## YEAR_BUILT AMENITIES_AVAILABLE
## <dbl> <chr>
## 1 1953 Indoor exercise room
## 2 1972 Indoor exercise room
## 3 1969 Indoor exercise room
## 4 2005 Indoor exercise room
## 5 1960 Indoor exercise room
## 6 1969 Indoor exercise room
## 7 2010 Indoor exercise room
## 8 1965 Indoor exercise room
## 9 2017 Indoor exercise room
## 10 1970 Indoor exercise room
## # … with 25 more rows
#Frequency table#
Indoor_Exercise_Roomtable<-ftable(Indoor_Exercise_Room)
#Convert Tables to a Data Frame #
Indoor_Exercise_Room.df <- as.data.frame(Indoor_Exercise_Roomtable)
#Change the factor to numeric#
Indoor_Exercise_Room.df$YEAR_BUILT=as.numeric(levels(Indoor_Exercise_Room.df$YEAR_BUILT))[Indoor_Exercise_Room.df$YEAR_BUILT]
#Plot#
Indoor_Exercise_Room_Plot=ggplot(Indoor_Exercise_Room.df, aes(x = YEAR_BUILT, y = Freq))+
geom_smooth(se = FALSE, method = "loess", color = "grey30") +
geom_point(alpha = 0.5, col = "cornflowerblue", size = 3)+
geom_line(alpha = 0.5,col = "red", size = 1) +
labs(title="Indoor Exercise Room Available in Toronto Apartment Buildings",subtitle= "From 1800 to 2020", y="Indoor Exercise Room Available", x = "Years ",caption = "Source: Apartment Building Registration (Toronto Open Data)")
Indoor_Exercise_Room_Plot + scale_x_continuous(breaks=seq(1800, 2020, 10))
## `geom_smooth()` using formula 'y ~ x'
\(~\)
library(dplyr)
Outdoor_rec_facilities <- filter(Data_Study,AMENITIES_AVAILABLE == "Outdoor rec facilities")
Outdoor_rec_facilities
## # A tibble: 34 x 2
## YEAR_BUILT AMENITIES_AVAILABLE
## <dbl> <chr>
## 1 1954 Outdoor rec facilities
## 2 1994 Outdoor rec facilities
## 3 1928 Outdoor rec facilities
## 4 1954 Outdoor rec facilities
## 5 1910 Outdoor rec facilities
## 6 1959 Outdoor rec facilities
## 7 1971 Outdoor rec facilities
## 8 1959 Outdoor rec facilities
## 9 1991 Outdoor rec facilities
## 10 1967 Outdoor rec facilities
## # … with 24 more rows
#Frequency table#
Outdoor_rec_facilitiestable<-ftable(Indoor_Exercise_Room)
#Convert Tables to a Data Frame #
Outdoor_rec_facilities.df <- as.data.frame(Outdoor_rec_facilitiestable)
#Change the factor to numeric#
Outdoor_rec_facilities.df$YEAR_BUILT=as.numeric(levels(Outdoor_rec_facilities.df$YEAR_BUILT))[Outdoor_rec_facilities.df$YEAR_BUILT]
#Plot#
Outdoor_rec_facilities_Plot=ggplot(Outdoor_rec_facilities.df, aes(x = YEAR_BUILT, y = Freq))+
geom_smooth(se = FALSE, method = "loess", color = "grey30") +
geom_point(alpha = 0.5, col = "cornflowerblue", size = 3)+
geom_line(alpha = 0.5,col = "red", size = 1) +
labs(title="Outdoor Rec Facilities Available in Toronto Apartment Buildings",subtitle= "From 1800 to 2020", y="Outdoor Rec Facilities Available", x = "Years ",caption = "Source: Apartment Building Registration (Toronto Open Data)")
Outdoor_rec_facilities_Plot + scale_x_continuous(breaks=seq(1800, 2020, 10))
## `geom_smooth()` using formula 'y ~ x'
\(~\)
library(dplyr)
Outdoor_pool <- filter(Data_Study,AMENITIES_AVAILABLE == "Outdoor pool")
Outdoor_pool
## # A tibble: 88 x 2
## YEAR_BUILT AMENITIES_AVAILABLE
## <dbl> <chr>
## 1 1974 Outdoor pool
## 2 1972 Outdoor pool
## 3 1968 Outdoor pool
## 4 1964 Outdoor pool
## 5 1962 Outdoor pool
## 6 1969 Outdoor pool
## 7 1956 Outdoor pool
## 8 1971 Outdoor pool
## 9 1963 Outdoor pool
## 10 1964 Outdoor pool
## # … with 78 more rows
#Frequency table#
Outdoor_pooltable<-ftable(Outdoor_pool)
Outdoor_pool.df <- as.data.frame(Outdoor_pooltable)
#Change the factor to numeric#
Outdoor_pool.df$YEAR_BUILT=as.numeric(levels(Outdoor_pool.df$YEAR_BUILT))[Outdoor_pool.df$YEAR_BUILT]
#Plot#
Outdoor_pool_Plot=ggplot(Outdoor_pool.df,aes(x = YEAR_BUILT, y = Freq))+
geom_smooth(se = FALSE, method = "loess", color = "grey30") +
geom_point(alpha = 0.5, col = "cornflowerblue", size = 3)+
geom_line(alpha = 0.5,col = "red", size = 1) +
labs(title="Outdoor Pool Available in Toronto Apartment Buildings",subtitle= "From 1800 to 2020", y="Outdoor Pool Available", x = "Years ",caption = "Source: Apartment Building Registration (Toronto Open Data)")
Outdoor_pool_Plot + scale_x_continuous(breaks=seq(1800, 2020, 10))
## `geom_smooth()` using formula 'y ~ x'
\(~\)
library(dplyr)
Sauna <- filter(Data_Study,AMENITIES_AVAILABLE == "Sauna")
Sauna
## # A tibble: 4 x 2
## YEAR_BUILT AMENITIES_AVAILABLE
## <dbl> <chr>
## 1 1966 Sauna
## 2 1957 Sauna
## 3 1970 Sauna
## 4 1968 Sauna
#Frequency table#
Saunatable<-ftable(Sauna)
Sauna.df <- as.data.frame(Saunatable)
#Change the factor to numeric#
Sauna.df$YEAR_BUILT=as.numeric(levels(Sauna.df$YEAR_BUILT))[Sauna.df$YEAR_BUILT]
#Plot#
Sauna_Plot=ggplot(Sauna.df,aes(x = YEAR_BUILT, y = Freq))+
geom_smooth(se = FALSE, method = "loess", color = "grey30") +
geom_point(alpha = 0.5, col = "cornflowerblue", size = 3)+
geom_line(alpha = 0.5,col = "red", size = 1) +
labs(title="Sauna Available in Toronto Apartment Buildings",subtitle= "From 1800 to 2020", y="Sauna Available", x = "Years ",caption = "Source: Apartment Building Registration (Toronto Open Data)")
Sauna_Plot + scale_x_continuous(breaks=seq(1800, 2020, 10))
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 1956.9
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 11.065
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 16.524
\(~\)
library(dplyr)
Indoor_pool <- filter(Data_Study,AMENITIES_AVAILABLE == "Indoor pool")
Indoor_pool
## # A tibble: 7 x 2
## YEAR_BUILT AMENITIES_AVAILABLE
## <dbl> <chr>
## 1 1973 Indoor pool
## 2 1960 Indoor pool
## 3 1974 Indoor pool
## 4 1964 Indoor pool
## 5 1970 Indoor pool
## 6 1970 Indoor pool
## 7 1969 Indoor pool
#Frequency table#
Indoor_pooltable <-ftable(Indoor_pool)
Indoor_pool.df <- as.data.frame(Indoor_pooltable)
#Change the factor to numeric#
Indoor_pool.df$YEAR_BUILT=as.numeric(levels(Indoor_pool.df$YEAR_BUILT))[Indoor_pool.df$YEAR_BUILT]
#Plot#
Indoor_pool_Plot=ggplot(Indoor_pool.df,aes(x = YEAR_BUILT, y = Freq))+
geom_smooth(se = FALSE, method = "loess", color = "grey30") +
geom_point(alpha = 0.5, col = "cornflowerblue", size = 3)+
geom_line(alpha = 0.5,col = "red", size = 1) +
labs(title="Indoor Pool Available in Toronto Apartment Buildings",subtitle= "From 1800 to 2020", y="Indoor Pool Available", x = "Years ",caption = "Source: Apartment Building Registration (Toronto Open Data)")
Indoor_pool_Plot + scale_x_continuous(breaks=seq(1800, 2020, 10))
## `geom_smooth()` using formula 'y ~ x'
\(~\) — \(~\)
Dawei Lang (2020). wordcloud2: Create Word Cloud by htmlWidget. R package version 0.2.2. https://github.com/lchiffon/wordcloud2
Fillas, Jessica Elizabeth (2019). 5 Purpose-Built Toronto Rentals that will Give You Serious Condo Envy. https://foxmarin.ca/purpose-built-toronto-rentals/
Hadley Wickham (2011). The Split-Apply-Combine Strategy for Data Analysis. Journal of Statistical Software, 40(1), 1-29. URL http://www.jstatsoft.org/v40/i01/.
Hadley Wickham (2019). rvest: Easily Harvest (Scrape) Web Pages. R package version 0.3.5. https://CRAN.R-project.org/package=rvest.
Kun Ren and Kenton Russell (2016). formattable: Create ‘Formattable’ Data Structures. R package version 0.2.0.1. https://CRAN.R-project.org/package=formattable.
Municipal Licensing and Standards (2020). Apartment Building Registration Data. [Data file]. https://open.toronto.ca/dataset/apartment-building-registration/
Sam Firke (2020). janitor: Simple Tools for Examining and Cleaning Dirty Data. R package version 1.2.1. https://CRAN.R-project.org/package=janitor.
Sharla Gelfand (2019). opendatatoronto: Access the City of Toronto Open Data Portal. R package version 0.1.1. https://CRAN.R-project.org/package=opendatatoronto.
Wickham et al., (2019). Welcome to the tidyverse. Journal of Open Source Software, 4(43), 1686, https://doi.org/10.21105/joss.01686.