world_population<-read.csv("https://raw.githubusercontent.com/chirag-vithlani/607/master/project2/world_populationOnlyCountry.csv", header= TRUE,sep=",",na.strings = "?",stringsAsFactors=FALSE)
dplyr’s select() functioni.e. selecting only year last 30 years of data ( with gap of 10 years )
library(dplyr)
# World population subset
wp_subset<- select(world_population,Country.Name,Country.Code,X1993,X2003,X2013)
head(wp_subset,1)
## Country.Name Country.Code X1993 X2003 X2013
## 1 Afghanistan AFG 14824371 22507368 30682500
#other way to achieve same thing
#wp_subset<-subset(world_population, select=c("Country.Name","X1993","X2003","X2013"))
rename() function of plyrsuppressWarnings(detach("package:dplyr", unload=TRUE)) # this is required step as dplyr and plyr doesn't work togather
# Getting reuired world population data
suppressWarnings(library(plyr))
wp<-rename(wp_subset, c("Country.Name"="country","Country.Code"="code","X1993"="1993", "X2003"="2003","X2013"="2013"))
head(wp,1)
## country code 1993 2003 2013
## 1 Afghanistan AFG 14824371 22507368 30682500
gather() function to get data year wise.Putting ‘year’ information in rows
suppressWarnings(library(dplyr)) #dplyr is a powerful R-package to transform and summarize tabular data with rows and columns.
# gather() makes wide data longer (here we are removing 3rd to 5th column)
wp_year_wise<-gather(wp, "Year", "Count", 3:5)
head(wp_year_wise,1)
## country code Year Count
## 1 Afghanistan AFG 1993 14824371
filter function.suppressWarnings(library(plotly))
#wp_subset$percent<-round((wp_subset$X2013/total_wp13)*100,2)
wp_year_wise <- wp_year_wise %>% filter(!is.na(wp_year_wise$Count))
This is bit complex step, although formula is simple, just divide country population with total world population, but we need to find this total population with respect to ( in that particular ) year.
First finding total population in ( that !! ) year.
we do this by using functions mutate(), sum() and group_by()
(Note : There was row in source data for ’world` too (that would make life easier) , but I choose to remove it and kept only country specific data.)
wp_year_wise<- wp_year_wise %>%
group_by(Year) %>%
mutate(Total_Population=sum(as.numeric(Count)))
#just to see what data generated (after above step)
subset( wp_year_wise, wp_year_wise$code=='USA')
## Source: local data frame [3 x 5]
## Groups: Year [3]
##
## country code Year Count Total_Population
## (chr) (chr) (chr) (int) (dbl)
## 1 United States USA 1993 259919000 5549751644
## 2 United States USA 2003 290107933 6369167987
## 3 United States USA 2013 316497531 7192695310
wp_year_wise$percent<-round((wp_year_wise$Count/wp_year_wise$Total_Population)*100,4)
#save(wp_year_wise, file="mydata.RData")
We all know China and India are most populous countries,but how much with respect to world population.
Population_India_China_2013<-subset(wp_year_wise,wp_year_wise$Year==2013 & (wp_year_wise$code=='IND' | wp_year_wise$code=='CHN'))
India_China_2013_Sum_percent<-sum(Population_India_China_2013$percent)
dataset<-data.frame(labels = c(" India + China "," Rest"),values = c(India_China_2013_Sum_percent,100-India_China_2013_Sum_percent))
plot_ly(dataset, labels = labels, values = values, type = "pie") %>%
layout(title = "China + India Population v/s Rest Of Wolrd")
#library(sqldf)
wpUCI<-subset(wp_year_wise,wp_year_wise$country=='India' | wp_year_wise$country=='China' | wp_year_wise$country=='United States')
head(wpUCI,1)
## Source: local data frame [1 x 6]
## Groups: Year [1]
##
## country code Year Count Total_Population percent
## (chr) (chr) (chr) (int) (dbl) (dbl)
## 1 China CHN 1993 1178440000 5549751644 21.2341
plot_ly(wpUCI, x = Year, y = code, size = percent,color = percent,opacity = percent, mode = "markers")
US population is comparatively small, we always knew this, but this gives nice relative insight.Like initially China was leading, but now India is caching up. US population hasn’t changed much ( with respect to world ) as color of bubble doesn’t change with time.
world_forests<-read.csv("https://raw.githubusercontent.com/chirag-vithlani/607/master/project2/forests.csv", header= TRUE,sep=",",na.strings = "?",stringsAsFactors=FALSE)
dplyr’s select() functioni.e. selecting only year last 30 years of data ( with gap of 10 years )
wf_subset<-subset(world_forests, select=c("Country.Name","Country.Code","X1993","X2003","X2013"))
head(wf_subset)
## Country.Name Country.Code X1993 X2003 X2013
## 1 Aruba ABW 2.333333 2.333333 2.333333
## 2 Andorra AND 34.042553 34.042553 34.042553
## 3 Afghanistan AFG 2.067825 2.067825 2.067825
## 4 Angola AGO 48.609609 47.608567 46.607524
## 5 Albania ALB 28.574818 28.363504 28.227007
## 6 United Arab Emirates ARE 3.163876 3.722488 3.833493
#need to detach dplyr as plyr and dplyr both doesn't work
detach("package:dplyr", unload=TRUE)
## Warning: 'dplyr' namespace cannot be unloaded:
## namespace 'dplyr' is imported by 'tidyr' so cannot be unloaded
suppressWarnings(library(plyr))
wf<-rename(wf_subset, c("Country.Name"="country","Country.Code"="code","X1993"="1993", "X2003"="2003","X2013"="2013"))
head(wf,1)
## country code 1993 2003 2013
## 1 Aruba ABW 2.333333 2.333333 2.333333
library(dplyr)
wf_year_wise<-gather(wf, "Year", "Count", 3:5)
head(wf_year_wise,1)
## country code Year Count
## 1 Aruba ABW 1993 2.333333
wf <- wf_year_wise %>% filter(!is.na(wf_year_wise$Count))
head(wf,1)
## country code Year Count
## 1 Aruba ABW 1993 2.333333
mutate() and group_by() to achieve this.wf<- wf %>%
group_by(Year) %>%
mutate(Total_Forest_Area=sum(as.numeric(Count)))
wf$percent<-round((wf$Count/wf$Total_Forest_Area)*100,4)
#df <- read.csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_world_gdp_with_codes.csv')
l <- list(color = toRGB("grey"), width = 0.5)
g <- list(
showframe = FALSE,
showcoastlines = FALSE,
projection = list(type = 'Mercator')
)
plot_ly(wf, z = Count, text = code, locations = code, type = 'choropleth', color = Count, colors = 'Greens', marker = list(line = l),colorbar = list(ticksuffix = '%', title = 'Forests Color Map')) %>% layout(title = 'Forests',geo = g)
I didn’t know Finaland has huge forestry.
Three columns are common among two datasets ( world population & forest )
Population_And_Forests<-merge(wf,wp_year_wise,by=c("country","code","Year"))
head(Population_And_Forests,1)
## country code Year Count.x Total_Forest_Area percent.x Count.y
## 1 Afghanistan AFG 1993 2.067825 6989.493 0.0296 14824371
## Total_Population percent.y
## 1 5549751644 0.2671
options(scipen = 999)
Population_And_Forests$forests_by_population<-round(((Population_And_Forests$Total_Forest_Area/Population_And_Forests$Total_Population)*100),8)
arrage() function to sort.head(Population_And_Forests)
## country code Year Count.x Total_Forest_Area percent.x Count.y
## 1 Afghanistan AFG 1993 2.067825 6989.493 0.0296 14824371
## 2 Afghanistan AFG 2003 2.067825 6965.984 0.0297 22507368
## 3 Afghanistan AFG 2013 2.067825 6983.402 0.0296 30682500
## 4 Albania ALB 1993 28.574818 6989.493 0.4088 3227287
## 5 Albania ALB 2003 28.363504 6965.984 0.4072 3039616
## 6 Albania ALB 2013 28.227007 6983.402 0.4042 2897366
## Total_Population percent.y forests_by_population
## 1 5549751644 0.2671 0.00012594
## 2 6369167987 0.3534 0.00010937
## 3 7192695310 0.4266 0.00009709
## 4 5549751644 0.0582 0.00012594
## 5 6369167987 0.0477 0.00010937
## 6 7192695310 0.0403 0.00009709
sorted<-arrange(Population_And_Forests,-forests_by_population)
head(sorted)
## country code Year Count.x Total_Forest_Area percent.x Count.y
## 1 Afghanistan AFG 1993 2.0678246 6989.493 0.0296 14824371
## 2 Albania ALB 1993 28.5748175 6989.493 0.4088 3227287
## 3 Algeria DZA 1993 0.6888241 6989.493 0.0099 27785977
## 4 American Samoa ASM 1993 91.4500000 6989.493 1.3084 50725
## 5 Andorra AND 1993 34.0425532 6989.493 0.4871 61003
## 6 Angola AGO 1993 48.6096094 6989.493 0.6955 12246786
## Total_Population percent.y forests_by_population
## 1 5549751644 0.2671 0.00012594
## 2 5549751644 0.0582 0.00012594
## 3 5549751644 0.5007 0.00012594
## 4 5549751644 0.0009 0.00012594
## 5 5549751644 0.0011 0.00012594
## 6 5549751644 0.2207 0.00012594
subset(sorted,select=c("country","forests_by_population"),sorted$code=="IND"|sorted$code=="USA"|sorted$code=="CHN")
## country forests_by_population
## 41 China 0.00012594
## 84 India 0.00012594
## 192 United States 0.00012594
## 244 China 0.00010937
## 287 India 0.00010937
## 397 United States 0.00010937
## 449 China 0.00009709
## 492 India 0.00009709
## 604 United States 0.00009709