# Qualitative univariate visualisation
## Load and wrangle - msnbc

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
msnbc <- read.csv("C:/Users/Jason/Desktop/Grad Cert Data Science/3. Data Visualisation/data/msnbc.csv")

# Build the summary table in one pipeline:
#   1. count the rows for each value of First (the first page visited),
#   2. express each count as a proportion and a percentage of all rows,
#   3. turn First into a factor whose levels run from the highest count to
#      the lowest (the minus sign makes order() sort in descending order).
msnbc_sum <- msnbc %>%
  group_by(First) %>%
  summarise(count = n()) %>%
  mutate(
    Proportion = count / nrow(msnbc),
    Percent = Proportion * 100,
    First = factor(First, levels = First[order(-count)])
  )

## Bar chart

# The summary table already holds the values to plot, so stat = "identity"
# tells geom_bar() to use them as-is instead of counting rows itself.
bars <- geom_bar(stat = "identity")

# Bar chart of raw counts
p1 <- ggplot(msnbc_sum, aes(x = First, y = count))
p1 + bars

# Same chart with percentages on the y-axis
p2 <- ggplot(msnbc_sum, aes(x = First, y = Percent))
p2 + bars

# Reusable pieces: tilted x-axis tick labels and descriptive titles
tilted_ticks <- theme(axis.text.x = element_text(angle = 45, hjust = 1))
page_labels <- labs(
  title = "Unique Visits to Different MSNBC.com Landing Pages \n 28/09/1999",
  y = "Percentage of Unique Visitors",
  x = "Landing Page within MSNBC.com Domain"
)
p2 + bars + tilted_ticks + page_labels

# Annotate each bar with its percentage (rounded to 2 d.p.), just above the bar
percent_labels <- geom_text(
  aes(label = round(Percent, 2)),
  vjust = -0.5,
  size = 3
)
p2 + bars + tilted_ticks + page_labels + percent_labels

# Switch to the minimal theme and a blue fill. theme_minimal() is added before
# tilted_ticks so that the tick-label rotation is not reset by the new theme.
p2 +
  geom_bar(stat = "identity", fill = "dodgerblue3") +
  theme_minimal() +
  tilted_ticks +
  page_labels +
  percent_labels

## Scales

# Check scales - you can grossly misrepresent data by adjusting the scale.
# Restrict the summary to a handful of pages with similar percentages.
similar_pages <- c("tech", "local", "health", "bbs", "summary", "sports")
msnbc_sum_filt <- filter(msnbc_sum, First %in% similar_pages)

p2.2 <- ggplot(msnbc_sum_filt, aes(x = First, y = Percent))

# Zooming the y-axis to a narrow window makes the differences between pages
# appear much larger than they are.
truncated_axis <- coord_cartesian(ylim = c(5.5, 6.25))
p2.2 + geom_bar(stat = "identity") + truncated_axis

# For comparison: the same bars with the y-axis correctly anchored at 0
p2.2 + geom_bar(stat = "identity")

## Dot plot

# The same counts as a dot plot: pages on the y-axis, counts on the x-axis
p3 <- ggplot(msnbc_sum, aes(x = count, y = First))
p3 + geom_point()

# Reverse the ordering: re-level First by ascending count. The plot object has
# to be rebuilt afterwards because it captured the data before the re-levelling.
ascending_pages <- msnbc_sum$First[order(msnbc_sum$count)]
msnbc_sum$First <- factor(msnbc_sum$First, levels = ascending_pages)
p3 <- ggplot(msnbc_sum, aes(x = count, y = First))
p3 + geom_point()

# Trailing lines connecting each dot to its y-axis label. geom_segment() draws
# a line from (x, y) to (xend, yend); linetype = 2 is a dashed line.
leader_lines <- geom_segment(
  aes(x = 0, xend = count, y = First, yend = First),
  linetype = 2
)
p3 + geom_point() + leader_lines

# Final version: coloured points, titles and a data label next to each dot.
# NOTE: the x-axis upper limit is hard-coded; a count above 350000 would fall
# outside the scale limits and not be drawn, so raise it if the data change.
p3 +
  geom_point(colour = "dodgerblue3") +
  leader_lines +
  labs(
    title = "Unique Visits to Different MSNBC.com \n Landing Pages \n 28/09/1999",
    x = "No. of Unique Visitors",
    y = "Landing Page within MSNBC.com Domain"
  ) +
  geom_text(aes(label = round(count, 2)), hjust = -.2, size = 3) +
  scale_x_continuous(limits = c(0, 350000))

## Pie chart

# A pie chart is a single stacked bar (x = factor(1)) drawn in polar
# coordinates: coord_polar(theta = "y") wraps the stacked counts around the
# circle.
p4 <- ggplot(msnbc_sum, aes(x = factor(1), y = count, fill = First))
p4 + geom_bar(stat = "identity", width = 1) + coord_polar(theta = "y")

# Keep the five landing pages with the highest counts. Ranking on descending
# count works for any number of rows in msnbc_sum, rather than assuming a
# fixed number of pages. Then recalculate the proportions and percentages
# relative to the top five pages only, and sort from largest to smallest.
msnbc_sum_top_five <- msnbc_sum %>%
  filter(min_rank(desc(count)) <= 5) %>%
  mutate(
    Proportion = count / sum(count),
    Percent = Proportion * 100
  ) %>%
  arrange(desc(count))

# Rerun the pie chart using the top five filtered dataset
p5 <- ggplot(msnbc_sum_top_five, aes(x = factor(1), y = count, fill = First))
p5 + geom_bar(stat = "identity", width = 1) + coord_polar(theta = "y")

# Clean up the pie chart (no axes, grid or border) and add data labels.
# position_stack(vjust = 0.5) centres each label inside its own slice using
# the same stacking as the bars, so the labels stay correct regardless of the
# row order of the data or the level order of First.
p5 + geom_bar(stat = "identity", width = 1) + coord_polar(theta = "y") +
  theme(axis.ticks = element_blank(),
        axis.title = element_blank(),
        axis.text.y = element_blank(),
        axis.text.x = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank()) +
  labs(fill = "Top Five Landing Pages") +
  geom_text(aes(label = round(Percent, 2)),
            position = position_stack(vjust = 0.5))

## Coxcomb diagram (polar area diagram)

# One bar per top-five landing page, drawn in polar coordinates. theta = "x"
# (the coord_polar() default) maps the pages to angle and the counts to radius.
p6 <- ggplot(
  msnbc_sum_top_five,
  aes(x = First, y = count, fill = First)
)
p6 +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "x")