library(ggplot2)
library(dplyr)
library(corrplot)
library(corrgram)
library(ggmap)

Introduction

Data

# Load the raw table. header = FALSE, so column names are assigned explicitly
# on the next statement.
df <- read.csv("./Workbook3.csv", sep = ",", header = FALSE, skip = 0)
names(df) <- c(
  "image", "rank", "Country",
  "Population", "Internet_Population", "Total_Revenu_US_Dollar"
)

# The three count columns contain punctuation (e.g. periods); strip every
# punctuation character and convert the remaining text to numeric.
df[c("Population", "Internet_Population", "Total_Revenu_US_Dollar")] <- lapply(
  df[c("Population", "Internet_Population", "Total_Revenu_US_Dollar")],
  function(raw_count) as.numeric(gsub("[[:punct:]]", "", as.character(raw_count)))
)

# Ratio used throughout the report as the "internet penetration rate".
# NOTE(review): this is Population / Internet_Population, i.e. people per
# internet user -- the inverse of the usual penetration definition. The
# thresholds 4 and 10 used further down rely on this orientation; confirm
# before changing.
df$InternetPenetrationRate <- with(df, Population / Internet_Population)

# Country factor whose levels are sorted by descending total revenue, so bar
# charts list the highest-revenue countries first.
df$ReorderedType <- with(df, reorder(Country, -Total_Revenu_US_Dollar))

Plots

# Total revenue per country (countries ordered by ReorderedType), log10 y axis.
ggplot(df, aes(x = ReorderedType, y = Total_Revenu_US_Dollar)) +
  geom_bar(stat = "identity", width = 0.9) +
  scale_y_log10() +
  labs(x = "Country", y = "Total Revenues ($US)") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8)
  )

# Same chart, with the bars filled according to each country's population.
ggplot(df, aes(x = ReorderedType, y = Total_Revenu_US_Dollar, fill = Population)) +
  geom_bar(stat = "identity", width = 0.9) +
  scale_y_log10() +
  labs(x = "Country", y = "Total Revenues ($US)") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8),
    legend.position = "top"
  )

Internet Penetration Rate distribution

# Histogram of the penetration ratio over all countries, using 100 bins.
ggplot(df, aes(x = InternetPenetrationRate)) +
  geom_histogram(bins = 100)

The distribution has two extreme outliers:

# Show the rows whose penetration ratio exceeds 10.
df %>%
  filter(InternetPenetrationRate > 10)
##    image rank             Country Population Internet_Population
## 1 tt.svg   96 Trinidad and Tobago    1365000                 989
## 2 cy.svg  100              Cyprus    1177000                 880
##   Total_Revenu_US_Dollar InternetPenetrationRate       ReorderedType
## 1               16049000                1380.182 Trinidad and Tobago
## 2               13677000                1337.500              Cyprus

For the next plots, I filter the data to remove these two outliers (for visualization purposes only).

# Histogram of the penetration ratio, restricted to rows with a ratio below 10.
df %>%
  filter(InternetPenetrationRate < 10) %>%
  ggplot(aes(x = InternetPenetrationRate)) +
  geom_histogram(bins = 100)

# Revenue per country for the same subset, bars filled by the penetration ratio.
df %>%
  filter(InternetPenetrationRate < 10) %>%
  ggplot(
    aes(
      x = ReorderedType,
      y = Total_Revenu_US_Dollar,
      fill = InternetPenetrationRate
    )
  ) +
  geom_bar(stat = "identity", width = 0.9) +
  scale_y_log10() +
  labs(x = "Country", y = "Total Revenues ($US)") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8),
    legend.position = "top"
  )

# Countries whose ratio lies strictly between 4 and 10.
df %>%
  filter(InternetPenetrationRate > 4, InternetPenetrationRate < 10) %>%
  select(Country, rank, Total_Revenu_US_Dollar, InternetPenetrationRate)
##      Country rank Total_Revenu_US_Dollar InternetPenetrationRate
## 1  Indonesia   17              598074000                4.075351
## 2   Pakistan   57               89702000                4.949231
## 3 Bangladesh   68               50859000                6.156878
## 4       Iraq   69               49975000                5.167630
## 5    Myanmar   70               48196000                4.107283

Map of Total Revenues

# Return one geographic coordinate for each place name, using ggmap's geocode().
#
# Args:
#   x:     place name(s); coerced with as.character(), so factors are accepted.
#   coord: column of the geocode() result to return. The callers in this file
#          use 1 for longitude and 2 for latitude.
#
# Returns:
#   A numeric vector with one value per element of x (numeric(0) for empty x).
#
# The earlier implementation wrapped the lookup in `for (i in 1:length(x))`
# but returned during the first iteration, so every element after x[1] was
# silently ignored; the whole vector is now geocoded in a single call.
getGeo <- function(x, coord) {
  places <- as.character(x)
  if (length(places) == 0) {
    # Nothing to look up; avoid calling the geocoding service at all.
    return(numeric(0))
  }
  located <- geocode(places)
  as.numeric(located[[coord]])
}
# Geocode every country. vapply() enforces exactly one numeric value per
# country, whereas sapply() changes its return type on empty or ragged results
# (e.g. it yields list() for a zero-row data frame).
# NOTE(review): each country is looked up twice, once per coordinate.
df$geoLon <- vapply(
  as.character(df$Country), getGeo, numeric(1),
  coord = 1, USE.NAMES = FALSE
)
df$geoLat <- vapply(
  as.character(df$Country), getGeo, numeric(1),
  coord = 2, USE.NAMES = FALSE
)

# World outline used as the base layer.
map <- ggplot() +
  borders("world", colour = "grey75", fill = "white")

# One point per country (rows with a ratio below 10 only): point size encodes
# total revenue and colour encodes the penetration ratio. The ellipse is fitted
# to the five countries listed below.
map +
  geom_point(
    data = filter(df, InternetPenetrationRate < 10),
    aes(
      x = geoLon,
      y = geoLat,
      size = Total_Revenu_US_Dollar,
      color = InternetPenetrationRate
    ),
    alpha = .5
  ) +
  theme(legend.text = element_text(size = 5), legend.position = "top") +
  stat_ellipse(
    data = filter(
      df,
      Country %in% c("Indonesia", "Bangladesh", "Pakistan", "Iraq", "Myanmar")
    ),
    aes(x = geoLon, y = geoLat),
    size = .5
  )

Comments: the countries whose InternetPenetrationRate is above 4 (see the table above; roughly outlined by the ellipse) appear to lie along a common axis running through Iraq, Pakistan and Indonesia.

Correlation

# Pairwise correlations between the four numeric measures.
cor.data <- cor(
  df[c(
    "Population",
    "Internet_Population",
    "Total_Revenu_US_Dollar",
    "InternetPenetrationRate"
  )]
)

# Draw the correlation matrix with ellipse glyphs; the value returned by
# corrplot() is kept in corrPLOT.
corrPLOT <- corrplot(cor.data, method = "ellipse")

To show how big a player China is, we can plot total revenue as a function of Internet population.

# Revenue against internet population for rows with a ratio below 10.
# Colour encodes population and point size the penetration ratio; a country is
# labelled only when its revenue exceeds 1e10 or its internet population
# exceeds 3e8.
df %>%
  filter(InternetPenetrationRate < 10) %>%
  ggplot(aes(x = Internet_Population, y = Total_Revenu_US_Dollar)) +
  geom_point(
    aes(color = Population, size = InternetPenetrationRate),
    alpha = .4
  ) +
  geom_text(
    aes(
      label = ifelse(
        Total_Revenu_US_Dollar > 1e10 | Internet_Population > 3e8,
        as.character(Country),
        ''
      ),
      hjust = -.15,
      vjust = 0
    )
  ) +
  xlim(0, 8.5e8) +
  theme(legend.position = "top")

Acknowledgment: Data available for free use from NEWZOO

History :