Click the Original, Code and Reconstruction tabs to read about the issues and how they were fixed.
Objective
Objective of the visualization: To provide information about the top 100 websites in the USA, ranked by traffic.
Target audience: Business owners who are looking to own a website and improve their brand, website analytics companies, and the general public.
The visualization chosen had the following three main issues:
Reference
The following code was used to fix the issues identified in the original.
library(tidyr)
library(dplyr)
library(readr)
library(knitr)
library(rvest)
library(stringi)
library(ggplot2)
# Import the data: download the article page, collect every <table> node and
# parse the first one into a data frame.
websites_100 <- read_html("https://www.semrush.com/blog/most-visited-websites/")
websites_100_node <- websites_100 %>% html_nodes("table")
websites_100.df <- websites_100_node[[1]] %>% html_table()
# Row indices 1..100 used by the loops further down the script.
total_rows <- seq_len(100)
# Transform the desktop and mobile share columns.
# Two new character columns, desk_percent and mob_percent, receive the text
# that precedes the first "%" in `Desktop share` and `Mobile share`.
# The conversion to numeric happens later in the script.
#
# The work is done on whole columns instead of an index loop over 1:100, so
# every row of the table is processed and no not-yet-existing column is
# indexed element by element.
desk_vis <- websites_100.df$`Desktop share`
websites_100.df$desk_percent <- sub("%.*$", "", desk_vis)

mob_vis <- websites_100.df$`Mobile share`
websites_100.df$mob_percent <- sub("%.*$", "", mob_vis)
# Convert the Visits strings into a numeric traffic count in millions.
#   - a value containing "B" is read as billions and multiplied by 1000
#   - otherwise, a value containing "M" is read as millions and kept as is
#   - anything else (including NA) becomes NA instead of being skipped
# The result vector is preallocated to the column length, so it always lines
# up with the rows of the table.
visits_raw <- as.character(websites_100.df$Visits)
is_billion <- grepl("B", visits_raw, fixed = TRUE)
is_million <- !is_billion & grepl("M", visits_raw, fixed = TRUE)

visit_1 <- rep(NA_real_, length(visits_raw))
# Keep only the text before the unit suffix, then parse it as a number.
visit_1[is_billion] <-
  as.double(sub("B.*$", "", visits_raw[is_billion])) * 1000
visit_1[is_million] <-
  as.double(sub("M.*$", "", visits_raw[is_million]))

websites_100.df$traffic_count <- visit_1
# Hand-assigned category for each of the top-100 websites, one entry per row
# of the table, in row order (the row range is noted at the end of each group).
Category <- c(
  "Search Engines", "Streaming", "Social media", "Retail and E-commerce",
  "Informational databases", "Web Portals", "Web and File Hosting", "Porn ",
  "Social media", "Social media", # rows 1-10
  "Retail and E-commerce", "Social media", "Social media", "News",
  "Ad Networks", "Porn ", "Retail and E-commerce", "Weather news",
  "Sports", "Informational databases", # rows 11-20
  "News", "Ad Networks", "Search Engines", "Technology",
  "Porn ", "Email", "Snail Mail", "Web Portals",
  "Retail and E-commerce", "Ad Networks", # rows 21-30
  "Retail and E-commerce", "Business", "Search Engines",
  "Retail and E-commerce", "Social media", "News", "Weather news",
  "Email", "Streaming", "Social media", # rows 31-40
  "Video communication", "Business", "Streaming", "Retail and E-commerce",
  "Health info", "Informational databases", "Retail and E-commerce",
  "Retail and E-commerce", "Network Communications", "Snail Mail", # rows 41-50
  "Technology", "Snail Mail", "News", "Banking",
  "News", "Web and File Hosting", "Network Communications", "Web Portals",
  "Non-profit organisations", "Business", # rows 51-60
  "Banking", "Web and File Hosting", "News", "Web Portals",
  "Porn ", "Weather news", "Retail and E-commerce", "Banking",
  "Technology", "Email", # rows 61-70
  "Medical agency", "News", "Banking", "News",
  "News", "Web and File Hosting", "News", "Porn ",
  "Games", "Technology", # rows 71-80
  "Streaming", "News", "Non-profit organisations", "Business",
  "Business", "Non-profit organisations", "News", "News",
  "Streaming", "Banking", # rows 81-90
  "Social media", "Technology", "Porn ", "Retail and E-commerce",
  "Technology", "Technology", "News", "Social media",
  "Retail and E-commerce", "Retail and E-commerce" # rows 91-100
)
# Attach the vector to the table as a new column named Category.
websites_100.df[["Category"]] <- Category
# Drop the unwanted columns at positions 3 to 8.
websites_100.df_new <- websites_100.df[-(3:8)]
# Replace the `Traffic rank` values (world rankings, per the original author)
# with the USA ranking 1..100, assigned in row order.
Rank <- seq_len(100)
websites_100.df_new[["Traffic rank"]] <- Rank
# Convert the two percentage columns from text to double.
websites_100.df_new[c("desk_percent", "mob_percent")] <- lapply(
  websites_100.df_new[c("desk_percent", "mob_percent")],
  as.double
)
# Part 1: horizontal bar chart of the top-100 domains, ordered by traffic
# (in millions), with each bar labelled with its traffic value.
ranked_bar_cht <- ggplot(
  websites_100.df_new,
  aes(x = reorder(Domain, traffic_count), y = traffic_count)
) +
  geom_bar(
    stat = "identity",
    position = position_dodge(width = 10),
    width = 0.8,
    fill = "darkturquoise"
  ) +
  # Value labels sit just past the end of each bar; overlapping ones are hidden.
  geom_text(
    aes(label = traffic_count),
    check_overlap = TRUE,
    hjust = -0.4,
    size = 3.6
  ) +
  # Flip the axes so the domain names read horizontally.
  coord_flip() +
  theme_classic(base_size = 17) +
  theme(axis.text.x = element_text(angle = 0)) +
  labs(
    title = "Rankings of Top-100 visited Domains in USA",
    subtitle = "[2021 Edition]",
    x = "Domain Names",
    y = "Traffic(in Millions)"
  )
# Count how many of the top-100 domains fall into each category.
category_counts <- summarise(
  group_by(websites_100.df_new, Category),
  no_of_categories = n()
)
# Plain data frame copy with the column names used by the chart below.
website_100_category.df <- setNames(
  as.data.frame(category_counts),
  c("Categories", "counts")
)
# Horizontal bar chart of the number of domains per category, one manually
# chosen fill colour per category (23 colours for the 23 categories).
category_palette <- c(
  "coral3", "chocolate4", "chartreuse4", "cadetblue4",
  "darkslategray", "brown4", "darkseagreen4", "bisque4", "aquamarine4", "purple",
  "darkorchid3", "black", "cornflowerblue", "cornsilk4", "goldenrod2",
  "firebrick4", "mistyrose3", "dodgerblue4", "wheat4", "skyblue4", "slateblue", "slateblue1", "cyan4"
)
categorized_domains <- ggplot(
  website_100_category.df,
  aes(x = Categories, y = counts, fill = Categories)
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = category_palette) +
  theme_classic(base_size = 16) +
  labs(
    title = "Categories of Top-100 visited Domains in USA(2021)",
    x = "Domain Categories",
    y = "Number of Domains"
  )
# Part 3: pie chart of the average desktop vs mobile traffic share across the
# top-100 websites.
# The averages ignore rows whose percentage could not be parsed (NA), so a
# single bad row does not turn the whole slice into NA.
users.df <- data.frame(
  visit_type = c("Desktop", "Mobile"),
  percentage = c(
    mean(websites_100.df_new$desk_percent, na.rm = TRUE),
    mean(websites_100.df_new$mob_percent, na.rm = TRUE)
  )
)
traffic_type_pie_cht <- ggplot(
  users.df,
  aes(x = "", y = percentage, fill = factor(visit_type))
) +
  geom_bar(width = 1, stat = "identity") +
  # Stack the labels the same way as the bars and centre each one inside its
  # slice; values are rounded to one decimal place and shown as percentages.
  geom_text(
    aes(
      label = paste0(round(percentage, 1), "%"),
      group = factor(visit_type)
    ),
    position = position_stack(vjust = 0.5),
    size = 3.5
  ) +
  # Polar coordinates turn the single stacked bar into a pie.
  coord_polar(theta = "y", start = 0) +
  scale_fill_brewer(palette = "BuGn") +
  theme(
    axis.line = element_blank(),
    plot.title = element_text(hjust = 0.5)
  ) +
  labs(
    fill = "Visit type",
    x = NULL,
    y = NULL,
    title = "Desktop vs Mobile Share of Traffic (Top-100 Domains)"
  )
Data Reference
The following plot fixes the main issues in the original.