# Qualitative Univariate visualisation
## Load and wrangle - msnbc
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Read the raw MSNBC page-visit data.
msnbc <- read.csv("C:/Users/Jason/Desktop/Grad Cert Data Science/3. Data Visualisation/data/msnbc.csv")

# Summarise to one row per value of `First`, with:
#   count      - number of rows of `msnbc` having that value
#   Proportion - count as a share of all rows of `msnbc`
#   Percent    - Proportion scaled to 0-100
# `First` is then turned into a factor whose levels are sorted by descending
# count, so plots built from msnbc_sum order the categories by frequency.
msnbc_sum <- msnbc %>%
  group_by(First) %>%
  summarise(count = n()) %>%
  mutate(
    Proportion = count / nrow(msnbc),
    Percent = Proportion * 100,
    First = factor(First, levels = First[order(desc(count))])
  )
## Bar chart
# Layers shared by several of the plots below are built once and reused.
# stat = "identity" makes geom_bar() plot the values already stored in
# msnbc_sum instead of applying its default counting transformation.
identity_bars <- geom_bar(stat = "identity")
# Rotate the x-axis tick labels 45 degrees so the page names do not overlap.
tilted_x_text <- theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Descriptive title and axis labels.
landing_page_labels <- labs(
  title = "Unique Visits to Different MSNBC.com Landing Pages \n 28/09/1999",
  y = "Percentage of Unique Visitors",
  x = "Landing Page within MSNBC.com Domain"
)
# Annotate each bar with its percentage, rounded to two decimals.
percent_annotations <- geom_text(
  aes(label = round(Percent, 2)),
  vjust = -0.5,
  size = 3
)

# Plot 1: raw counts per landing page.
p1 <- ggplot(msnbc_sum, aes(x = First, y = count))
p1 + identity_bars

# Plot 2: same chart with the y aesthetic switched to percentages.
p2 <- ggplot(msnbc_sum, aes(x = First, y = Percent))
p2 + identity_bars

# Plot 3: readable x-axis tick labels plus descriptive labels.
p2 +
  identity_bars +
  tilted_x_text +
  landing_page_labels

# Plot 4: add the percentage annotations.
p2 +
  identity_bars +
  tilted_x_text +
  landing_page_labels +
  percent_annotations

# Plot 5: change theme and bar colour. theme_minimal() is added before the
# axis-text tweak so that the tweak is not overwritten by the complete theme.
p2 +
  geom_bar(stat = "identity", fill = "dodgerblue3") +
  theme_minimal() +
  tilted_x_text +
  landing_page_labels +
  percent_annotations
## Scales
# Check scales - you can grossly misrepresent data by adjusting the scale.
# Restrict the summary to a few selected pages with similar results.
similar_pages <- c("tech", "local", "health", "bbs", "summary", "sports")
msnbc_sum_filt <- filter(msnbc_sum, First %in% similar_pages)

p2.2 <- ggplot(msnbc_sum_filt, aes(x = First, y = Percent))

# Zooming the y axis to 5.5-6.25 makes the differences between pages appear
# much larger than they are.
p2.2 +
  geom_bar(stat = "identity") +
  coord_cartesian(ylim = c(5.5, 6.25))

# For comparison: the same bars with the y axis correctly anchored at 0.
p2.2 +
  geom_bar(stat = "identity")
## Dot plot
# Same counts as a dot plot: landing page on the y axis, count on the x axis.
p3 <- ggplot(msnbc_sum, aes(y = First, x = count))
p3 + geom_point()

# Re-level First by ascending count, which reverses the order in which the
# pages appear along the y axis. The plot object has to be rebuilt afterwards
# so that it picks up the re-levelled data.
msnbc_sum <- msnbc_sum %>%
  mutate(First = factor(First, levels = First[order(count)]))
p3 <- ggplot(msnbc_sum, aes(y = First, x = count))
p3 + geom_point()

# Trailing lines connecting each dot to its y-axis label. geom_segment() draws
# a line from (x, y) to (xend, yend); linetype = 2 is a dashed line.
trailing_lines <- geom_segment(
  aes(x = 0, y = First, xend = count, yend = First),
  linetype = 2
)
p3 +
  geom_point() +
  trailing_lines

# Final version: coloured dots, title and axis labels, and the count printed
# to the right of each dot. The x axis is fixed to 0-350000.
p3 +
  geom_point(colour = "dodgerblue3") +
  trailing_lines +
  labs(
    title = "Unique Visits to Different MSNBC.com \n Landing Pages \n 28/09/1999",
    x = "No. of Unique Visitors",
    y = "Landing Page within MSNBC.com Domain"
  ) +
  geom_text(aes(label = round(count, 2)), hjust = -.2, size = 3) +
  scale_x_continuous(limits = c(0, 350000))
## Pie chart
# A pie chart is a single stacked bar (x = factor(1)) drawn in polar
# coordinates; coord_polar(theta = "y") maps the stacked counts to angle.
p4 <- ggplot(msnbc_sum, aes(x = factor(1), y = count, fill = First))
p4 + geom_bar(stat = "identity", width = 1) + coord_polar(theta = "y")

# Keep the five landing pages with the highest counts. The cut-off is derived
# from the number of rows (n() - 5) rather than a hard-coded rank, so it still
# selects the top five if the number of landing pages changes.
# Proportions and percentages are then recalculated relative to the top five
# only, and the rows are sorted from largest to smallest count.
msnbc_sum_top_five <- msnbc_sum %>%
  filter(rank(count) > n() - 5) %>%
  mutate(
    Proportion = count / sum(count),
    Percent = Proportion * 100
  ) %>%
  arrange(desc(count))

# Rerun the pie chart using the top-five dataset.
p5 <- ggplot(msnbc_sum_top_five, aes(x = factor(1), y = count, fill = First))
p5 + geom_bar(stat = "identity", width = 1) + coord_polar(theta = "y")

# Clean up the pie chart (no axes, ticks, grid or border) and add data labels.
# position_stack(vjust = 0.5) centres each label inside its own wedge using
# the same stacking as the bars, so label placement does not depend on the
# row order of the data or on the level order of First.
p5 +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  theme(
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    axis.text.y = element_blank(),
    axis.text.x = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank()
  ) +
  labs(fill = "Top Five Landing Pages") +
  geom_text(
    aes(label = round(Percent, 2)),
    position = position_stack(vjust = 0.5)
  )
## Coxcomb diagram (polar area diagram)
# One bar per top-five landing page, drawn in polar coordinates. theta = "x"
# (the coord_polar() default) maps the landing page to angle, so the count
# is shown by how far each wedge extends from the centre.
p6 <- ggplot(msnbc_sum_top_five, aes(x = First, y = count, fill = First))
p6 +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "x")