Data retrieved from http://skyscrapercenter.com/buildings on 9/26/2015

Make country into different column - separate from city

suppressMessages(library(ggplot2))
suppressMessages(library(tidyr))
suppressMessages(library(dplyr))
suppressMessages(library(countrycode))  
suppressMessages(library(printr))


bldg=read.csv("tallestbuildings.csv")
head(bldg)
Rank Building.Name City HeightMeter HeightFeet Floors Completed Material Use
1 Burj Khalifa Dubai (AE) 828.0 2,717 163 2010 steel/concrete office / residential / hotel
2 Makkah Royal Clock Tower Hotel Mecca (SA) 601.0 1,972 120 2012 steel/concrete other / hotel
3 One World Trade Center New York City (US) 541.3 1,776 94 2014 composite office
4 TAIPEI 101 Taipei (TW) 508.0 1,667 101 2004 composite office
5 Shanghai World Financial Center Shanghai (CN) 492.0 1,614 101 2008 composite hotel / office
6 International Commerce Centre Hong Kong (CN) 484.0 1,588 108 2010 composite hotel / office
str(bldg)
## 'data.frame':    100 obs. of  9 variables:
##  $ Rank         : int  1 2 3 4 5 6 7 7 9 10 ...
##  $ Building.Name: Factor w/ 100 levels "23 Marina","4 World Trade Center",..: 15 53 65 79 73 40 68 69 100 96 ...
##  $ City         : Factor w/ 40 levels "Abu Dhabi (AE)",..: 9 23 28 35 32 13 19 19 27 6 ...
##  $ HeightMeter  : num  828 601 541 508 492 ...
##  $ HeightFeet   : Factor w/ 77 levels "1,001","1,002",..: 65 64 63 62 61 60 59 59 58 57 ...
##  $ Floors       : int  163 120 94 101 101 108 88 88 66 108 ...
##  $ Completed    : int  2010 2012 2014 2004 2008 2010 1998 1998 2010 1974 ...
##  $ Material     : Factor w/ 4 levels "composite","concrete",..: 4 4 1 1 1 1 1 1 1 3 ...
##  $ Use          : Factor w/ 13 levels "hotel","hotel / office",..: 7 8 6 6 2 2 6 6 2 6 ...
# Make HeightFeet numeric
bldg$HeightFeet=as.character(bldg$HeightFeet)
bldg$HeightFeet=gsub(",","",bldg$HeightFeet)
bldg$HeightFeet=as.numeric(bldg$HeightFeet)

# same as: bldg$HeightFeet=as.numeric(gsub(",","",as.character(bldg$HeightFeet)))

# http://astrostatistics.psu.edu/su07/R/html/base/html/regex.html

newbldg=bldg %>% separate(City, c("City","Country"),sep="[:punct:]") 

# Number of buildings in different countries

table(newbldg$Country)
AE AU CA CL CN GB JP KR KW MY QA RU SA TH TW US VN
21 2 1 1 36 1 1 2 2 4 1 4 3 1 2 17 1
# change country codes to country names

newbldg$Country=countrycode(newbldg$Country,"iso2c","country.name")

#table(newbldg$Country)

## dplyr used here

newbldg %>% group_by(Country)%>% 
  summarise(Count=length(Country))%>%
  ggplot(aes(x= reorder(Country,Count),y=Count)) +
    geom_bar(stat="identity") +xlab("")+coord_flip() +geom_text(aes(Country,Count+1, label=Count))

newbldg %>% group_by(Country)%>% 
  summarise(Mean.Height=round(mean(HeightFeet),2))%>%
    ggplot(aes(x= reorder(Country,Mean.Height),y=Mean.Height)) +
    geom_bar(stat="identity") +xlab("")+coord_flip() +geom_text(aes(Country,Mean.Height-100, label=Mean.Height),color="white")

newbldg %>% group_by(Country)%>% 
  summarise(Mean.Height=round(mean(HeightFeet),2),
            Count=length(Country))%>%
    ggplot(aes(x= reorder(Country,Mean.Height),y=Mean.Height,fill=as.factor(Count))) +
    geom_bar(stat="identity") +xlab("")+coord_flip() +geom_text(aes(Country,Mean.Height+75, label=Mean.Height))+ scale_fill_discrete(name="Number of \nBuildings")

newbldg %>% group_by(City)%>% 
  summarise(Count=length(City))%>%
  ggplot(aes(x= reorder(City,Count),y=Count)) +
    geom_bar(stat="identity") +xlab("")+coord_flip() +scale_y_continuous(breaks=c(1,2,3,4,6,7,18))

newbldg %>% group_by(City)%>% 
  summarise(Mean.Height=round(mean(HeightFeet),2))%>%
    ggplot(aes(x= reorder(City,Mean.Height),y=Mean.Height)) +
    geom_point() +xlab("")+coord_flip() 

utilbldg=newbldg %>% separate(Use, c("Use1","Use2","Use3"),sep="/")%>%
  gather(numuses,Use,Use1:Use3)%>%
  filter(Use!="")
head(utilbldg)
Rank Building.Name City Country HeightMeter HeightFeet Floors Completed Material numuses Use
1 Burj Khalifa Dubai United Arab Emirates 828.0 2717 163 2010 steel/concrete Use1 office
2 Makkah Royal Clock Tower Hotel Mecca Saudi Arabia 601.0 1972 120 2012 steel/concrete Use1 other
3 One World Trade Center New York City United States 541.3 1776 94 2014 composite Use1 office
4 TAIPEI 101 Taipei Taiwan, Province of China 508.0 1667 101 2004 composite Use1 office
5 Shanghai World Financial Center Shanghai China 492.0 1614 101 2008 composite Use1 hotel
6 International Commerce Centre Hong Kong China 484.0 1588 108 2010 composite Use1 hotel
#http://stackoverflow.com/questions/2261079/how-to-trim-leading-and-trailing-whitespace-in-r

trim <- function (x) gsub("^\\s+|\\s+$", "", x)
utilbldg$Use <- trim(utilbldg$Use)

utilbldg%>%group_by(Use)%>% summarise(MeanHeight=mean(HeightFeet))
Use MeanHeight
hotel 1190.100
office 1166.222
other 1972.000
residential 1133.387
retail 1072.667
serviced apartments 1118.000
utilbldg %>% group_by(Use)%>%summarise(Count=length(Use))
Use Count
hotel 40
office 72
other 1
residential 31
retail 3
serviced apartments 2
utilbldg%>%group_by(Use)%>% summarise(MeanHeight=mean(HeightFeet),Count=length(Use))%>%
  ggplot(.,aes(x=Use,y=MeanHeight,fill=as.factor(Count)))+geom_bar(stat="identity")+geom_text(aes(Use,MeanHeight+40, label=round(MeanHeight,2)))+ scale_fill_discrete(name="Number of \nBuildings")+ylab("Mean Height, in feet")