Import Dataset

#Import tallestbuildings
library(readxl)
tallestbuildings<-read_excel("tallestbuildings.xlsx")

#Preview the dataset
tallestbuildings

## # A tibble: 100 x 9
##     Rank `Building Name` City  `Height (m)` `Height (ft)` Floors Completed
##    <dbl> <chr>           <chr>        <dbl>         <dbl>  <dbl>     <dbl>
##  1     1 Burj Khalifa    Duba…         828           2717    163      2010
##  2     2 Shanghai Tower  Shan…         632           2073    128      2015
##  3     3 Makkah Royal C… Mecc…         601           1972    120      2012
##  4     4 One World Trad… New …         541.          1776     94      2014
##  5     5 TAIPEI 101      Taip…         508           1667    101      2004
##  6     6 Shanghai World… Shan…         492           1614    101      2008
##  7     7 International … Hong…         484           1588    108      2010
##  8     8 Petronas Twin … Kual…         452.          1483     88      1998
##  9     8 Petronas Twin … Kual…         452.          1483     88      1998
## 10    10 Zifeng Tower    Nanj…         450           1476     66      2010
## # … with 90 more rows, and 2 more variables: Material <chr>, Use <chr>

Initially looking at what needs to be tidied:
- Separate City Name and Country Code
- Separate Materials
- Separate Uses

Run Libraries

library(tidyverse)

## ── Attaching packages ─────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0

## ── Conflicts ────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(ggplot2)

library(rattle)

## Rattle: A free graphical interface for data science with R.
## Version 5.3.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

library(countrycode)

Tidy the Data

#Removing the space from column names
names(tallestbuildings) %>% normVarNames()

## [1] "rank"          "building_name" "city"          "height_m"     
## [5] "height_ft"     "floors"        "completed"     "material"     
## [9] "use"

#Separating the City and the Country
tcico <- tallestbuildings %>%
  separate(City, into = c("City", "Country"),sep = "[()]")

## Warning: Expected 2 pieces. Additional pieces discarded in 100 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

#Keeps the Country Code, but drops the ()

#Preview of dataset with separated city and country code
tcico

## # A tibble: 100 x 10
##     Rank `Building Name` City  Country `Height (m)` `Height (ft)` Floors
##    <dbl> <chr>           <chr> <chr>          <dbl>         <dbl>  <dbl>
##  1     1 Burj Khalifa    Duba… AE              828           2717    163
##  2     2 Shanghai Tower  Shan… CN              632           2073    128
##  3     3 Makkah Royal C… Mecc… SA              601           1972    120
##  4     4 One World Trad… New … US              541.          1776     94
##  5     5 TAIPEI 101      Taip… TW              508           1667    101
##  6     6 Shanghai World… Shan… CN              492           1614    101
##  7     7 International … Hong… CN              484           1588    108
##  8     8 Petronas Twin … Kual… MY              452.          1483     88
##  9     8 Petronas Twin … Kual… MY              452.          1483     88
## 10    10 Zifeng Tower    Nanj… CN              450           1476     66
## # … with 90 more rows, and 3 more variables: Completed <dbl>, Material <chr>,
## #   Use <chr>

Exercises

1. Display how many buildings are there in each city represented in that dataset. An arrangement in either an ascending or a descending order of number of buildings is always helpful.

CityBuildCount <- tcico %>%
  group_by(City) %>%
  summarise(Number=length(City))

ggplot(CityBuildCount,aes(reorder(City,Number),Number)) + geom_bar(stat="identity", fill = "blue") + coord_flip() +  
  labs(x="City", y='Count') + 
  ggtitle('Buildings in Each City')

ggplot(CityBuildCount,aes(reorder(City,Number),Number)) + geom_bar(stat="identity", fill = "blue") +  
  labs(x="City", y='Count') + 
  ggtitle('Buildings in Each City') +
  theme(axis.text.x = element_text(angle = 90))

2. Please plot the different cities in order of the mean height of buildings in a city.

#rename column Height (ft) for ease
colnames(tcico)[colnames(tcico) == "Height (ft)"] <- "Height_ft"

tcico

## # A tibble: 100 x 10
##     Rank `Building Name` City  Country `Height (m)` Height_ft Floors Completed
##    <dbl> <chr>           <chr> <chr>          <dbl>     <dbl>  <dbl>     <dbl>
##  1     1 Burj Khalifa    Duba… AE              828       2717    163      2010
##  2     2 Shanghai Tower  Shan… CN              632       2073    128      2015
##  3     3 Makkah Royal C… Mecc… SA              601       1972    120      2012
##  4     4 One World Trad… New … US              541.      1776     94      2014
##  5     5 TAIPEI 101      Taip… TW              508       1667    101      2004
##  6     6 Shanghai World… Shan… CN              492       1614    101      2008
##  7     7 International … Hong… CN              484       1588    108      2010
##  8     8 Petronas Twin … Kual… MY              452.      1483     88      1998
##  9     8 Petronas Twin … Kual… MY              452.      1483     88      1998
## 10    10 Zifeng Tower    Nanj… CN              450       1476     66      2010
## # … with 90 more rows, and 2 more variables: Material <chr>, Use <chr>

CityMeanFeet <- tcico %>%
  group_by(City) %>%
  summarise(number=mean(Height_ft))

ggplot(CityMeanFeet,aes(reorder(City,number),number)) + geom_bar(stat="identity", fill = "purple") +  coord_flip() +
  labs(x="City", y= "Feet") + 
  ggtitle('Mean Height of Buildings in City')

ggplot(CityMeanFeet,aes(reorder(City,number),number)) + geom_bar(stat="identity", fill = "purple") +  
  labs(x="City", y= "Feet") + 
  ggtitle('Mean Height of Buildings in City') +
  theme(axis.text.x = element_text(angle = 90))

#Had to add warn, custom_dict, custom_match, origin_regex otherwise had error for no matches with country code
tcico$Country = countrycode(tcico$Country, 'iso2c', 'country.name', warn = TRUE, custom_dict = NULL, custom_match = NULL, origin_regex = FALSE)

3. Redo #1 with country information

CountryBuildCount <- tcico %>%
  group_by(Country) %>%
  summarise(Number=length(Country))

ggplot(CountryBuildCount,aes(reorder(Country,Number),Number)) + geom_bar(stat="identity", fill = "orange") + coord_flip() +  
  labs(x="Country", y='Count') + 
  ggtitle('Buildings in Each Country')

ggplot(CountryBuildCount,aes(reorder(Country,Number),Number)) + geom_bar(stat="identity", fill = "orange") +  
  labs(x="Country", y='Count') + 
  ggtitle('Buildings in Each Country') +
  theme(axis.text.x = element_text(angle = 90))

4. Redo #2 with country information

CountryMeanFeet <- tcico %>%
  group_by(Country) %>%
  summarise(number=mean(Height_ft))

ggplot(CountryMeanFeet,aes(reorder(Country,number),number)) + geom_bar(stat="identity", fill = "pink") + coord_flip() +
  labs(x="Country", y= "Feet") + 
  ggtitle('Mean Height of Buildings in Country')

ggplot(CountryMeanFeet,aes(reorder(Country,number),number)) + geom_bar(stat="identity", fill = "pink") +  
  labs(x="Country", y= "Feet") + 
  ggtitle('Mean Height of Buildings in Country') +
  theme(axis.text.x = element_text(angle = 90))

5. In 4 above, you would’ve plot different countries in order of their mean height of a buildings in a city. If you have not used a bar graph there, please create a bar graph. In this bar graph, please color each bar for a country based on the number of buildings from this dataset that are present in that country.

#Combines the information from Problems 3 and 4
CountryMeanCount <- tcico %>%
  group_by(Country) %>%
  summarise(CountryMeanFeet=mean(Height_ft),CountryBuildCount=length(Country))

#CountryMeanFeet becomes the bars, CountryBuildCount becomes the fill color
ggplot(CountryMeanCount,aes(reorder(Country,CountryMeanFeet),CountryMeanFeet,fill=CountryBuildCount)) + geom_bar(stat="identity") +  
  labs(x="Country", y= "Feet", fill = "Count") + 
  ggtitle('Mean Height of Buildings in Country') +
  theme(axis.text.x = element_text(angle = 90))

ggplot(CountryMeanCount,aes(reorder(Country,CountryMeanFeet),CountryMeanFeet,fill=CountryBuildCount)) + geom_bar(stat="identity") + coord_flip() +
  labs(x="Country", y= "Feet", fill = "Count") + 
  ggtitle('Mean Height of Buildings in Country')

6. What are the mean heights (in feet) of buildings that are used for different purposes. (Here, you will have different purposes in a column and the corresponding mean height in a different column.) In computing this, it is okay to double or triple count a building if it has multiple uses.

UseMeanFeet <- tcico %>%
  group_by(Use) %>%
  summarise(number=mean(Height_ft))

These are the intial graphs of the mean in feet for different uses before separating into separate columns.

ggplot(UseMeanFeet,aes(reorder(Use,number),number)) + geom_bar(stat="identity", fill = "red") +  coord_flip() +
  labs(x="Use", y= "Feet") + 
  ggtitle('Mean Height of Buildings in Country')

ggplot(UseMeanFeet,aes(reorder(Use,number),number)) + geom_bar(stat="identity", fill = "red") +  
  labs(x="Use", y= "Feet") + 
  ggtitle('Mean Height of Buildings in Country') +
  theme(axis.text.x = element_text(angle = 90))

#Selected only the columns I needed to work with for ease
tuse <- tcico %>%
  select(Country, Height_ft, Use)

tuse

## # A tibble: 100 x 3
##    Country              Height_ft Use                         
##    <chr>                    <dbl> <chr>                       
##  1 United Arab Emirates      2717 office / residential / hotel
##  2 China                     2073 hotel / office              
##  3 Saudi Arabia              1972 other / hotel               
##  4 United States             1776 office                      
##  5 Taiwan                    1667 office                      
##  6 China                     1614 hotel / office              
##  7 China                     1588 hotel / office              
##  8 Malaysia                  1483 office                      
##  9 Malaysia                  1483 office                      
## 10 China                     1476 hotel / office              
## # … with 90 more rows

#Separating the Uses (office, residential, hotel, retail, serviced apartments, or other); 6 use options total

#Separated them at the / into 3 columns, because that's the most any building had, then pivoted longer to place the use into one column

#Dropped the NA values otherwise added a NA bar to the graph
UseMeanFeet <- tuse %>%
  separate(Use, c("U1","U2", "U3"),sep = ' / ') %>%
  pivot_longer(c('U1', 'U2','U3'), names_to = "Uses", values_to = "Use") %>%
  drop_na(Use) %>%
  group_by(Use) %>%
  summarise(number=mean(Height_ft))

## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 89 rows [2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, ...].

UseMeanFeet

## # A tibble: 6 x 2
##   Use                 number
##   <chr>                <dbl>
## 1 hotel                1210.
## 2 office               1191.
## 3 other                1972 
## 4 residential          1150.
## 5 retail               1117 
## 6 serviced apartments  1118

Note: Intially I had done the separation as sep = ‘/’, which created duplicate columns due to the spacing. To fix this, I changed it to sep = ’ / ’, with spaces around the /. This combined the data into the 6 options for use.

ggplot(UseMeanFeet,aes(reorder(Use,number),number)) + geom_bar(stat="identity", fill = "green", alpha=0.4) +  coord_flip() +
  labs(x="Uses", y= "Feet") + 
  ggtitle('Mean Height of Buildings in Country')

ggplot(UseMeanFeet,aes(reorder(Use,number),number)) + geom_bar(stat="identity", fill = "green", alpha=0.4) +  
  labs(x="Uses", y= "Feet") + 
  ggtitle('Mean Height of Buildings in Country') +
  theme(axis.text.x = element_text(angle = 90))

Assignment 5

Marissa Beukelman

3/4/2020