#Import tallestbuildings
library(readxl)
tallestbuildings<-read_excel("tallestbuildings.xlsx")
#Preview the dataset
tallestbuildings
## # A tibble: 100 x 9
## Rank `Building Name` City `Height (m)` `Height (ft)` Floors Completed
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 Burj Khalifa Duba… 828 2717 163 2010
## 2 2 Shanghai Tower Shan… 632 2073 128 2015
## 3 3 Makkah Royal C… Mecc… 601 1972 120 2012
## 4 4 One World Trad… New … 541. 1776 94 2014
## 5 5 TAIPEI 101 Taip… 508 1667 101 2004
## 6 6 Shanghai World… Shan… 492 1614 101 2008
## 7 7 International … Hong… 484 1588 108 2010
## 8 8 Petronas Twin … Kual… 452. 1483 88 1998
## 9 8 Petronas Twin … Kual… 452. 1483 88 1998
## 10 10 Zifeng Tower Nanj… 450 1476 66 2010
## # … with 90 more rows, and 2 more variables: Material <chr>, Use <chr>
library(tidyverse)
## ── Attaching packages ─────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.3
## ✓ tidyr 1.0.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.3.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(countrycode)
#Removing the space from column names
names(tallestbuildings) %>% normVarNames()
## [1] "rank" "building_name" "city" "height_m"
## [5] "height_ft" "floors" "completed" "material"
## [9] "use"
#Separating the City and the Country
tcico <- tallestbuildings %>%
separate(City, into = c("City", "Country"),sep = "[()]")
## Warning: Expected 2 pieces. Additional pieces discarded in 100 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
#Keeps the Country Code, but drops the ()
#Preview of dataset with separated city and country code
tcico
## # A tibble: 100 x 10
## Rank `Building Name` City Country `Height (m)` `Height (ft)` Floors
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 Burj Khalifa Duba… AE 828 2717 163
## 2 2 Shanghai Tower Shan… CN 632 2073 128
## 3 3 Makkah Royal C… Mecc… SA 601 1972 120
## 4 4 One World Trad… New … US 541. 1776 94
## 5 5 TAIPEI 101 Taip… TW 508 1667 101
## 6 6 Shanghai World… Shan… CN 492 1614 101
## 7 7 International … Hong… CN 484 1588 108
## 8 8 Petronas Twin … Kual… MY 452. 1483 88
## 9 8 Petronas Twin … Kual… MY 452. 1483 88
## 10 10 Zifeng Tower Nanj… CN 450 1476 66
## # … with 90 more rows, and 3 more variables: Completed <dbl>, Material <chr>,
## # Use <chr>
CityBuildCount <- tcico %>%
group_by(City) %>%
summarise(Number=length(City))
ggplot(CityBuildCount,aes(reorder(City,Number),Number)) + geom_bar(stat="identity", fill = "blue") + coord_flip() +
labs(x="City", y='Count') +
ggtitle('Buildings in Each City')
ggplot(CityBuildCount,aes(reorder(City,Number),Number)) + geom_bar(stat="identity", fill = "blue") +
labs(x="City", y='Count') +
ggtitle('Buildings in Each City') +
theme(axis.text.x = element_text(angle = 90))
#rename column Height (ft) for ease
colnames(tcico)[colnames(tcico) == "Height (ft)"] <- "Height_ft"
tcico
## # A tibble: 100 x 10
## Rank `Building Name` City Country `Height (m)` Height_ft Floors Completed
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 Burj Khalifa Duba… AE 828 2717 163 2010
## 2 2 Shanghai Tower Shan… CN 632 2073 128 2015
## 3 3 Makkah Royal C… Mecc… SA 601 1972 120 2012
## 4 4 One World Trad… New … US 541. 1776 94 2014
## 5 5 TAIPEI 101 Taip… TW 508 1667 101 2004
## 6 6 Shanghai World… Shan… CN 492 1614 101 2008
## 7 7 International … Hong… CN 484 1588 108 2010
## 8 8 Petronas Twin … Kual… MY 452. 1483 88 1998
## 9 8 Petronas Twin … Kual… MY 452. 1483 88 1998
## 10 10 Zifeng Tower Nanj… CN 450 1476 66 2010
## # … with 90 more rows, and 2 more variables: Material <chr>, Use <chr>
CityMeanFeet <- tcico %>%
group_by(City) %>%
summarise(number=mean(Height_ft))
ggplot(CityMeanFeet,aes(reorder(City,number),number)) + geom_bar(stat="identity", fill = "purple") + coord_flip() +
labs(x="City", y= "Feet") +
ggtitle('Mean Height of Buildings in City')
ggplot(CityMeanFeet,aes(reorder(City,number),number)) + geom_bar(stat="identity", fill = "purple") +
labs(x="City", y= "Feet") +
ggtitle('Mean Height of Buildings in City') +
theme(axis.text.x = element_text(angle = 90))
#Had to add warn, custom_dict, custom_match, origin_regex otherwise had error for no matches with country code
tcico$Country = countrycode(tcico$Country, 'iso2c', 'country.name', warn = TRUE, custom_dict = NULL, custom_match = NULL, origin_regex = FALSE)
CountryBuildCount <- tcico %>%
group_by(Country) %>%
summarise(Number=length(Country))
ggplot(CountryBuildCount,aes(reorder(Country,Number),Number)) + geom_bar(stat="identity", fill = "orange") + coord_flip() +
labs(x="Country", y='Count') +
ggtitle('Buildings in Each Country')
ggplot(CountryBuildCount,aes(reorder(Country,Number),Number)) + geom_bar(stat="identity", fill = "orange") +
labs(x="Country", y='Count') +
ggtitle('Buildings in Each Country') +
theme(axis.text.x = element_text(angle = 90))
CountryMeanFeet <- tcico %>%
group_by(Country) %>%
summarise(number=mean(Height_ft))
ggplot(CountryMeanFeet,aes(reorder(Country,number),number)) + geom_bar(stat="identity", fill = "pink") + coord_flip() +
labs(x="Country", y= "Feet") +
ggtitle('Mean Height of Buildings in Country')
ggplot(CountryMeanFeet,aes(reorder(Country,number),number)) + geom_bar(stat="identity", fill = "pink") +
labs(x="Country", y= "Feet") +
ggtitle('Mean Height of Buildings in Country') +
theme(axis.text.x = element_text(angle = 90))
#Combines the information from Problems 3 and 4
CountryMeanCount <- tcico %>%
group_by(Country) %>%
summarise(CountryMeanFeet=mean(Height_ft),CountryBuildCount=length(Country))
#CountryMeanFeet becomes the bars, CountryBuildCount becomes the fill color
ggplot(CountryMeanCount,aes(reorder(Country,CountryMeanFeet),CountryMeanFeet,fill=CountryBuildCount)) + geom_bar(stat="identity") +
labs(x="Country", y= "Feet", fill = "Count") +
ggtitle('Mean Height of Buildings in Country') +
theme(axis.text.x = element_text(angle = 90))
ggplot(CountryMeanCount,aes(reorder(Country,CountryMeanFeet),CountryMeanFeet,fill=CountryBuildCount)) + geom_bar(stat="identity") + coord_flip() +
labs(x="Country", y= "Feet", fill = "Count") +
ggtitle('Mean Height of Buildings in Country')
UseMeanFeet <- tcico %>%
group_by(Use) %>%
summarise(number=mean(Height_ft))
These are the intial graphs of the mean in feet for different uses before separating into separate columns.
ggplot(UseMeanFeet,aes(reorder(Use,number),number)) + geom_bar(stat="identity", fill = "red") + coord_flip() +
labs(x="Use", y= "Feet") +
ggtitle('Mean Height of Buildings in Country')
ggplot(UseMeanFeet,aes(reorder(Use,number),number)) + geom_bar(stat="identity", fill = "red") +
labs(x="Use", y= "Feet") +
ggtitle('Mean Height of Buildings in Country') +
theme(axis.text.x = element_text(angle = 90))
#Selected only the columns I needed to work with for ease
tuse <- tcico %>%
select(Country, Height_ft, Use)
tuse
## # A tibble: 100 x 3
## Country Height_ft Use
## <chr> <dbl> <chr>
## 1 United Arab Emirates 2717 office / residential / hotel
## 2 China 2073 hotel / office
## 3 Saudi Arabia 1972 other / hotel
## 4 United States 1776 office
## 5 Taiwan 1667 office
## 6 China 1614 hotel / office
## 7 China 1588 hotel / office
## 8 Malaysia 1483 office
## 9 Malaysia 1483 office
## 10 China 1476 hotel / office
## # … with 90 more rows
#Separating the Uses (office, residential, hotel, retail, serviced apartments, or other); 6 use options total
#Separated them at the / into 3 columns, because that's the most any building had, then pivoted longer to place the use into one column
#Dropped the NA values otherwise added a NA bar to the graph
UseMeanFeet <- tuse %>%
separate(Use, c("U1","U2", "U3"),sep = ' / ') %>%
pivot_longer(c('U1', 'U2','U3'), names_to = "Uses", values_to = "Use") %>%
drop_na(Use) %>%
group_by(Use) %>%
summarise(number=mean(Height_ft))
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 89 rows [2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, ...].
UseMeanFeet
## # A tibble: 6 x 2
## Use number
## <chr> <dbl>
## 1 hotel 1210.
## 2 office 1191.
## 3 other 1972
## 4 residential 1150.
## 5 retail 1117
## 6 serviced apartments 1118
Note: Intially I had done the separation as sep = ‘/’, which created duplicate columns due to the spacing. To fix this, I changed it to sep = ’ / ’, with spaces around the /. This combined the data into the 6 options for use.
ggplot(UseMeanFeet,aes(reorder(Use,number),number)) + geom_bar(stat="identity", fill = "green", alpha=0.4) + coord_flip() +
labs(x="Uses", y= "Feet") +
ggtitle('Mean Height of Buildings in Country')
ggplot(UseMeanFeet,aes(reorder(Use,number),number)) + geom_bar(stat="identity", fill = "green", alpha=0.4) +
labs(x="Uses", y= "Feet") +
ggtitle('Mean Height of Buildings in Country') +
theme(axis.text.x = element_text(angle = 90))