library(ggplot2)
library(dplyr)
library(corrplot)
library(corrgram)
library(ggmap)
InternetPenetrationRate is defined here
as the ratio of Population
to Internet Population (so a larger value means that a smaller share of the population is online).
# Load the raw table. The file is read without a header row, so the column
# names are assigned explicitly right after.
df <- read.csv("./Workbook3.csv", header = FALSE, sep = ",", skip = 0)
names(df) <- c(
  "image", "rank", "Country",
  "Population", "Internet_Population", "Total_Revenu_US_Dollar"
)

# The three count columns are not read as plain numbers: strip every
# punctuation character (e.g. periods) and parse the remainder as numeric.
numeric_cols <- c("Population", "Internet_Population", "Total_Revenu_US_Dollar")
df[numeric_cols] <- lapply(
  df[numeric_cols],
  function(raw) as.numeric(gsub("[[:punct:]]", "", as.character(raw)))
)

# Internet penetration rate as used in this script: Population divided by
# Internet_Population.
df[["InternetPenetrationRate"]] <- df[["Population"]] / df[["Internet_Population"]]

# Country factor whose levels are ordered by decreasing total revenue, so the
# bar charts below are drawn from the largest to the smallest revenue.
df[["ReorderedType"]] <- reorder(df[["Country"]], -df[["Total_Revenu_US_Dollar"]])

# Total revenue per country (log scale).
ggplot(df, aes(x = ReorderedType, y = Total_Revenu_US_Dollar)) +
  geom_bar(stat = "identity", width = 0.9) +
  scale_y_log10() +
  xlab("Country") +
  ylab("Total Revenues ($US)") +
  theme(axis.text.x = element_text(size = 8, angle = 90, hjust = 1, vjust = 0.5))

# Same chart, with the bars filled according to the country's population.
ggplot(df, aes(x = ReorderedType, y = Total_Revenu_US_Dollar, fill = Population)) +
  geom_bar(stat = "identity", width = 0.9) +
  scale_y_log10() +
  xlab("Country") +
  ylab("Total Revenues ($US)") +
  theme(
    axis.text.x = element_text(size = 8, angle = 90, hjust = 1, vjust = 0.5),
    legend.position = "top"
  )

# Distribution of the penetration rate over all countries.
ggplot(df, aes(x = InternetPenetrationRate)) +
  geom_histogram(bins = 100)
We see that the distribution has two outliers:
# List the rows whose penetration rate is far outside the bulk of the data.
df %>%
  filter(InternetPenetrationRate > 10)
## image rank Country Population Internet_Population
## 1 tt.svg 96 Trinidad and Tobago 1365000 989
## 2 cy.svg 100 Cyprus 1177000 880
## Total_Revenu_US_Dollar InternetPenetrationRate ReorderedType
## 1 16049000 1380.182 Trinidad and Tobago
## 2 13677000 1337.500 Cyprus
For the next plots, I filter the data to remove these two outliers (for visualization purposes only).
# Histogram of the penetration rate once the rows with a rate of 10 or more
# have been dropped.
df %>%
  filter(InternetPenetrationRate < 10) %>%
  ggplot(aes(x = InternetPenetrationRate)) +
  geom_histogram(bins = 100)

# Revenue per country (log scale) on the same filtered rows, with the bars
# filled according to the penetration rate.
df %>%
  filter(InternetPenetrationRate < 10) %>%
  ggplot(aes(x = ReorderedType, y = Total_Revenu_US_Dollar, fill = InternetPenetrationRate)) +
  geom_bar(stat = "identity", width = 0.9) +
  scale_y_log10() +
  xlab("Country") +
  ylab("Total Revenues ($US)") +
  theme(
    axis.text.x = element_text(size = 8, angle = 90, hjust = 1, vjust = 0.5),
    legend.position = "top"
  )

# Countries whose penetration rate lies strictly between 4 and 10.
df %>%
  filter(InternetPenetrationRate > 4, InternetPenetrationRate < 10) %>%
  select(Country, rank, Total_Revenu_US_Dollar, InternetPenetrationRate)
## Country rank Total_Revenu_US_Dollar InternetPenetrationRate
## 1 Indonesia 17 598074000 4.075351
## 2 Pakistan 57 89702000 4.949231
## 3 Bangladesh 68 50859000 6.156878
## 4 Iraq 69 49975000 5.167630
## 5 Myanmar 70 48196000 4.107283
Total Revenues
# Look up one geographic coordinate for each place name with ggmap's geocode().
#
#   x     : place names (character or factor); each element is geocoded.
#   coord : position of the wanted value in the geocode() result; this script
#           uses 1 for the longitude and 2 for the latitude.
#
# Returns a numeric vector with one value per element of x (numeric(0) for
# empty input). The previous version returned from inside its loop, so only
# the first element of x was ever looked up.
getGeo <- function(x, coord) {
  places <- as.character(x)
  vapply(
    places,
    function(place) as.numeric(geocode(place))[coord],
    numeric(1),
    USE.NAMES = FALSE
  )
}
# Geocode every country once and reuse the result for both coordinates.
# Looking up longitude and latitude in two separate passes would send each
# request to the geocoding service twice.
geo <- geocode(as.character(df$Country))
df$geoLon <- geo$lon
df$geoLat <- geo$lat

# Empty world map used as the background layer.
map <- ggplot() +
  borders("world", colour = "grey75", fill = "white")

# One point per country (rows with a penetration rate of 10 or more are left
# out): point size shows total revenue, colour shows the penetration rate.
# The ellipse is drawn around the five countries whose rate is above 4.
map +
  geom_point(
    data = filter(df, InternetPenetrationRate < 10),
    aes(
      x = geoLon, y = geoLat,
      size = Total_Revenu_US_Dollar, color = InternetPenetrationRate
    ),
    alpha = .5
  ) +
  theme(legend.text = element_text(size = 5), legend.position = "top") +
  stat_ellipse(
    data = filter(
      df,
      Country %in% c("Indonesia", "Bangladesh", "Pakistan", "Iraq", "Myanmar")
    ),
    aes(x = geoLon, y = geoLat),
    size = .5
  )
Comments: the countries
whose InternetPenetrationRate
is above 4 (see the table above; they are roughly outlined by the ellipse) appear to lie along a common axis running from Iraq through Pakistan to Indonesia.
# Pairwise correlations between the four numeric indicators.
cor.data <- df %>%
  select(Population, Internet_Population, Total_Revenu_US_Dollar, InternetPenetrationRate) %>%
  cor()

# Draw the correlation matrix with one ellipse per pair of variables.
corrPLOT <- corrplot(cor.data, method = "ellipse")
To show how big a player China is,
we can plot total revenue as a function of the Internet population.
# Revenue against Internet population (rows with a penetration rate of 10 or
# more are left out). Colour shows population, point size the penetration
# rate; only countries with revenue above 1e10 or an Internet population
# above 3e8 get a text label.
df %>%
  filter(InternetPenetrationRate < 10) %>%
  ggplot(aes(x = Internet_Population, y = Total_Revenu_US_Dollar)) +
  geom_point(
    aes(color = Population, size = InternetPenetrationRate),
    alpha = .4
  ) +
  geom_text(
    aes(
      label = ifelse(
        Total_Revenu_US_Dollar > 1e10 | Internet_Population > 3e8,
        as.character(Country),
        ""
      ),
      hjust = -.15,
      vjust = 0
    )
  ) +
  xlim(0, 8.5e8) +
  theme(legend.position = "top")
History :