library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(cowplot)
## Warning: package 'cowplot' was built under R version 3.3.3
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
##
## ggsave
library(vcd)
## Warning: package 'vcd' was built under R version 3.3.3
## Loading required package: grid
library(GGally)
## Warning: package 'GGally' was built under R version 3.3.3
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.3.3
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.3.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.3.3
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.3.3
## Loading required package: Formula
## Warning: package 'Formula' was built under R version 3.3.3
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## combine, src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
library(lmtest)
## Warning: package 'lmtest' was built under R version 3.3.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.3.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(ggmosaic)
## Warning: package 'ggmosaic' was built under R version 3.3.3
## Loading required package: productplots
## Warning: package 'productplots' was built under R version 3.3.3
##
## Attaching package: 'productplots'
## The following objects are masked from 'package:vcd':
##
## mosaic, spine, tile
##
## Attaching package: 'ggmosaic'
## The following objects are masked from 'package:productplots':
##
## ddecker, hspine, mosaic, prodcalc, spine, vspine
## The following objects are masked from 'package:vcd':
##
## mosaic, spine
library(readr)
## Warning: package 'readr' was built under R version 3.3.3
library(cowplot)
library(RColorBrewer)
library(vcd)
library(GGally)
# Program: Master of Data Science @ RMIT. Subject: Data Visualisation
# Read the bicycle data set from CSV into `Bicycle`.
# NOTE(review): the path is absolute and machine-specific (Windows drive C:);
# adjust it before running on another machine.
Bicycle <- read_csv("C:/RMIT/Data-Visualisation/R/Bicycle.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## SortDes = col_character(),
## DS_LOCATION = col_character(),
## DT_ANALYSIS_SUMMARY = col_character(),
## DS_HOLIDAY = col_character(),
## Primary = col_logical(),
## weekend = col_logical(),
## Season = col_character(),
## `Cyclying Season` = col_character(),
## day = col_character()
## )
## See spec(...) for full column specifications.
# Summary table with one row per SortDes value: the number of records, that
# number as a share of all rows in Bicycle, and the same share as a percentage.
Bicyclesum <- Bicycle %>%
  group_by(SortDes) %>%
  summarise(
    count = n(),
    Proportion = count / nrow(Bicycle),
    Percent = count / nrow(Bicycle) * 100
  )
# Convert SortDes to a factor whose levels run from the most to the least
# frequent value, so plots built on this table follow that order.
Bicyclesum$SortDes <- factor(
  Bicyclesum$SortDes,
  levels = Bicyclesum$SortDes[order(-Bicyclesum$count)]
)
# Print the summary table.
Bicyclesum
## # A tibble: 34 x 4
## SortDes count Proportion Percent
## <fctr> <int> <dbl> <dbl>
## 1 Albert St E Bound Lane, Melbourne City 1056 0.018525341 1.8525341
## 2 Albert St W Bound Lane, Melbourne City 1056 0.018525341 1.8525341
## 3 Anniversary Trail No.1 2482 0.043541568 4.3541568
## 4 Anniversary Trail No.2 Kew 1980 0.034735014 3.4735014
## 5 Bay Trail in St Kilda 2496 0.043787169 4.3787169
## 6 Brighton Rd N Bound Lane 1129 0.019805975 1.9805975
## 7 Brighton Rd S Bound Lane 1129 0.019805975 1.9805975
## 8 Cannings Street, Carlton 2314 0.040594355 4.0594355
## 9 Capital City Trail, Princes Hill 2489 0.043664369 4.3664369
## 10 Federation Trail, Hobsons Bay 563 0.009876673 0.9876673
## # ... with 24 more rows
# Bar chart of the pre-computed counts in Bicyclesum: bar heights are taken
# directly from the `count` column rather than being tallied by the layer.
p1 <- ggplot(data = Bicyclesum, mapping = aes(x = SortDes, y = count))
p1 + geom_col()
# Horizontal bar chart of the number of records per bicycle path, with the
# paths ordered by how many records each has.
# Columns are referenced by bare name inside aes() (not Bicycle$SortDes) so the
# mapping is evaluated against the plot's `data`. reorder() scores each path
# with FUN = length, i.e. its number of rows.
d <- ggplot(
  data = Bicycle,
  aes(x = reorder(SortDes, SortDes, FUN = length))
)
d +
  geom_bar(stat = "count", color = "white", fill = "#008040") +
  ggtitle(" Bar plot for the bike path (S3613572)") +
  coord_flip() +
  xlab("Bicycle Path") +
  theme_minimal()
# No `plot` argument is given, so ggsave() writes its default plot
# (presumably the one just displayed) at 18 x 12 cm.
ggsave("plot5-1.png", width = 18, height = 12, units = "cm")
# Q2 ----
# Re-level SortDes from the least to the most frequent path so the dot plot
# rows are sorted by count.
Bicyclesum$SortDes <- factor(
  Bicyclesum$SortDes,
  levels = Bicyclesum$SortDes[order(Bicyclesum$count)]
)
# Dot plot of the percentage of records per path.
# Columns are referenced by bare name inside aes() (not Bicyclesum$SortDes) so
# every layer reads them from the plot's `data`.
p2 <- ggplot(Bicyclesum, aes(x = Percent, y = SortDes))
p2 +
  geom_point(colour = "dodgerblue3") +
  # Dashed guide line from zero out to each point.
  geom_segment(
    aes(x = 0, y = SortDes, xend = Percent, yend = SortDes),
    linetype = 2
  ) +
  labs(
    title = "Dot plot for the bike path (S3613572)",
    x = "Percent(%)",
    y = "Bicycle Path"
  ) +
  # Label each point with its percentage rounded to two decimals; the negative
  # hjust pushes the text to the right of the point.
  geom_text(aes(label = round(Percent, 2)), hjust = -.2, size = 3) +
  scale_x_continuous(limits = c(0, 6)) +
  theme_classic()
# No `plot` argument is given, so ggsave() writes its default plot.
ggsave("plot5-2.png", width = 18, height = 12, units = "cm")
# Weekday records for the "Royal Pde S Bound Lane" site.
# `weekend` is parsed as a logical column, so it is negated directly instead of
# being compared with the string "FALSE". Columns are referenced by bare name
# so filter() evaluates them against the piped data.
Bicycle_filter <- Bicycle %>%
  filter(SortDes == "Royal Pde S Bound Lane", !weekend)
## Warning: package 'bindrcpp' was built under R version 3.3.3
# Q3 ----
# Box plot of the 24-hour volume for the filtered (weekday, Royal Pde S Bound
# Lane) records.
# Columns are referenced by bare name inside aes() (not Bicycle_filter$SortDes)
# so the mapping is evaluated against the plot's `data`.
# NOTE(review): `fill` is mapped to the same continuous column as `y`; confirm
# this is intended for a single box.
d <- ggplot(
  data = Bicycle_filter,
  aes(x = SortDes, y = CT_VOLUME_24HOUR, fill = CT_VOLUME_24HOUR)
)
d +
  geom_boxplot() +
  ggtitle(" Box plot of the distribution of CT VOLUME 24HOUR (S3613572)") +
  ylab("CT VOLUME 24HOUR") +
  xlab("Weekday traffic of Royal Pde S Bound Lane ")
# No `plot` argument is given, so ggsave() writes its default plot.
ggsave("plot5-3.png", width = 18, height = 12, units = "cm")
# Q4 ----
# Histogram of the 24-hour volume for the filtered weekday records, 20 bins.
# The bin count has to be passed as `bins`: an unrecognised `bin` argument is
# ignored with a warning and stat_bin() falls back to 30 bins.
# The column is referenced by bare name inside aes() so it is read from the
# plot's `data`.
ggplot(Bicycle_filter, aes(x = CT_VOLUME_24HOUR)) +
  geom_histogram(
    bins = 20,
    stat = "bin",
    colour = "floralwhite",
    fill = "deeppink2",
    alpha = 1 / 2
  ) +
  ggtitle("Histogram of CT VOLUME 24HOUR (S3613572)") +
  xlab("CT VOLUME 24HOUR ")
# No `plot` argument is given, so ggsave() writes its default plot.
ggsave("plot5-4.png", width = 18, height = 12, units = "cm")
# Q6 ----
# Filled area of the binned 24-hour volume for the filtered weekday records,
# using 20 bins.
# The bin count has to be passed as `bins`: an unrecognised `bin` argument is
# ignored with a warning and stat_bin() falls back to 30 bins.
# NOTE(review): with stat = "bin" the y axis presumably shows binned counts
# rather than a kernel density estimate; confirm that matches the
# "Density" title.
ggplot(Bicycle_filter, aes(x = CT_VOLUME_24HOUR)) +
  geom_density(
    bins = 20,
    stat = "bin",
    colour = "floralwhite",
    fill = "deeppink2",
    alpha = 1 / 2
  ) +
  ggtitle("Density of CT VOLUME 24HOUR (S3613572)") +
  xlab("CT VOLUME 24HOUR ")
# No `plot` argument is given, so ggsave() writes its default plot.
ggsave("plot5-6.png", width = 18, height = 12, units = "cm")
# Same binned area plot as for the filtered data, but over every record in
# Bicycle, using 30 bins.
# The bin count has to be passed as `bins`: an unrecognised `bin` argument is
# ignored with a warning, leaving stat_bin() on its default.
# The column is referenced by bare name inside aes() so it is read from the
# plot's `data`.
ggplot(Bicycle, aes(x = CT_VOLUME_24HOUR)) +
  geom_density(
    bins = 30,
    stat = "bin",
    colour = "floralwhite",
    fill = "deeppink2",
    alpha = 1 / 2
  ) +
  ggtitle("Density of CT VOLUME 24HOUR (S3613572)") +
  xlab("CT VOLUME 24HOUR ")
# No `plot` argument is given, so ggsave() writes its default plot.
ggsave("plot5-6.1.png", width = 18, height = 12, units = "cm")
# Q7 ----
# Density-scaled histogram of the filtered 24-hour volume with a normal curve
# overlaid; the curve uses the sample mean and standard deviation of the same
# column.
# The column is referenced by bare name inside aes() (not
# Bicycle_filter$CT_VOLUME_24HOUR) so it is read from the plot's `data`; the
# `$` form is kept only outside aes(), where plain vectors are required.
ggplot(Bicycle_filter, aes(x = CT_VOLUME_24HOUR)) +
  geom_histogram(aes(y = ..density..)) +
  stat_function(
    fun = dnorm,
    args = list(
      mean = mean(Bicycle_filter$CT_VOLUME_24HOUR),
      sd = sd(Bicycle_filter$CT_VOLUME_24HOUR)
    ),
    lwd = 2,
    col = "red"
  ) +
  ggtitle("Histogram of CT VOLUME 24HOUR (S3613572)") +
  xlab("CT VOLUME 24HOUR ") +
  theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Density-scaled histogram of the filtered 24-hour volume with a kernel density
# estimate and a second, finer (100-bin) histogram drawn on top.
# The first histogram layer sets no bin count, which is why stat_bin() reports
# falling back to 30 bins.
# Columns are referenced by bare name inside aes() so every layer reads them
# from the plot's `data`.
ggplot(Bicycle_filter, aes(x = CT_VOLUME_24HOUR)) +
  geom_histogram(aes(y = ..density..)) +
  geom_density(fill = "dodgerblue", alpha = 1 / 2) +
  geom_histogram(
    aes(x = CT_VOLUME_24HOUR, y = ..density..),
    colour = "white",
    alpha = 1 / 2,
    bins = 100
  ) +
  ggtitle("Histogram of CT VOLUME 24HOUR (S3613572)") +
  xlab("CT VOLUME 24HOUR ") +
  theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# No `plot` argument is given, so ggsave() writes its default plot.
ggsave("plot5-7.png", width = 18, height = 12, units = "cm")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Q8 ----
# Histogram/density plot of the filtered 24-hour volume, annotated with
# vertical lines at the median (solid) and the mean (dashed red).
# Columns are referenced by bare name inside aes() so every layer reads them
# from the plot's `data`; the `$` form is kept only where a plain vector is
# needed (median() / mean()).
p8 <- ggplot(Bicycle_filter, aes(x = CT_VOLUME_24HOUR)) +
  geom_histogram(aes(y = ..density..)) +
  ggtitle("Histogram of CT VOLUME 24HOUR (S3613572)") +
  xlab("CT VOLUME 24HOUR ") +
  theme_bw()
p8 <- p8 +
  geom_density(fill = "dodgerblue", alpha = 1 / 2) +
  geom_histogram(
    aes(x = CT_VOLUME_24HOUR, y = ..density..),
    colour = "white",
    alpha = 1 / 2,
    bins = 100
  ) +
  geom_vline(
    xintercept = median(Bicycle_filter$CT_VOLUME_24HOUR),
    size = 1.5
  ) +
  # The label positions are fixed coordinates, not derived from the data.
  annotate("text", label = "Median", x = 1350, y = 0.0018) +
  geom_vline(
    xintercept = mean(Bicycle_filter$CT_VOLUME_24HOUR),
    linetype = 2,
    colour = "red",
    size = 1.5
  ) +
  annotate("text", label = "Mean", x = 1100, y = 0.002, colour = "red")
p8
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# No `plot` argument is given, so ggsave() writes its default plot.
ggsave("plot5-8.png", width = 18, height = 12, units = "cm")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Q9 ----
# Weekend records for the "Royal Pde S Bound Lane" site.
# `weekend` is parsed as a logical column, so it is used directly as the filter
# condition instead of being compared with the string "TRUE".
Bicycle_filter2 <- Bicycle %>%
  filter(SortDes == "Royal Pde S Bound Lane", weekend)
# Stacked dot plot of the weekend 24-hour volume; the column is referenced by
# bare name inside aes() so it is read from the plot's `data`.
p9 <- ggplot(data = Bicycle_filter2, aes(x = CT_VOLUME_24HOUR))
p9 +
  geom_dotplot(binwidth = 13) +
  ggtitle(" Stack dot plot of CT VOLUME 24HOUR on weekend (S3613572)") +
  xlab("CT VOLUME 24HOUR ")
# No `plot` argument is given, so ggsave() writes its default plot.
ggsave("plot5-9.png", width = 18, height = 12, units = "cm")
# Q10 ----
# Weekend records for the "Royal Pde S Bound Lane" site.
# `weekend` is parsed as a logical column, so it is used directly as the filter
# condition instead of being compared with the string "TRUE".
Bicycle_filter2 <- Bicycle %>%
  filter(SortDes == "Royal Pde S Bound Lane", weekend)
# Single box plot of the weekend 24-hour volume. geom_boxplot() has no
# `binwidth` parameter (it was ignored with a warning), so none is passed.
# NOTE(review): the 0-800 scale limits drop observations outside that range
# before the statistics are computed (see the "Removed 6 rows" warnings
# below); confirm that is intended.
p10 <- ggplot(Bicycle_filter2, aes(x = factor(1), y = CT_VOLUME_24HOUR)) +
  geom_boxplot() +
  scale_y_continuous(limits = c(0, 800)) +
  ylab("CT VOLUME 24HOUR ") +
  ggtitle(" Distribution CT VOLUME 24HOUR on weekend (S3613572)") +
  xlab("count")
# Dot plot of the same variable on the same 0-800 range.
p9 <- ggplot(data = Bicycle_filter2, aes(x = CT_VOLUME_24HOUR)) +
  geom_dotplot(binwidth = 8) +
  scale_x_continuous(limits = c(0, 800)) +
  xlab("CT VOLUME 24HOUR ")
# Stack the flipped box plot (y-axis title, text and ticks removed) above the
# dot plot in one column; the dot plot gets twice the height.
plot_grid(
  p10 +
    coord_flip() +
    theme(
      axis.title.y = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank()
    ),
  p9,
  ncol = 1,
  align = "v",
  rel_heights = c(1, 2)
)
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
## Warning: Removed 6 rows containing non-finite values (stat_bindot).
# No `plot` argument is given, so ggsave() writes its default plot.
ggsave("plot5-10.png", width = 18, height = 12, units = "cm")
# Q11 ----
# Read the water taste data set from CSV into `Water`.
# NOTE(review): the path is absolute and machine-specific (Windows drive C:);
# adjust it before running on another machine.
Water <- read_csv("C:/RMIT/Data-Visualisation/R/Water_taste.csv")
## Parsed with column specification:
## cols(
## Gender = col_character(),
## Age = col_integer(),
## Class = col_character(),
## UsuallyDrink = col_character(),
## FavBotWatBrand = col_character(),
## Preference = col_character(),
## First = col_character(),
## Second = col_character(),
## Third = col_character(),
## Fourth = col_character()
## )
# Replace the single-letter codes in `First` with brand names: "A", "B" and "C"
# map to named brands and any other non-missing value becomes "Tap water".
Water$First <- ifelse(
  Water$First == "A", "Sam’s Choice",
  ifelse(
    Water$First == "B", "Aquafina",
    ifelse(Water$First == "C", "Fiji", "Tap water")
  )
)
# Bar charts of first preference split by gender. `First` is referenced by bare
# name inside aes() (not Water$First) so it is read from the plot's `data`.
p11 <- ggplot(data = Water, aes(x = First, fill = Gender))
# Stacked to 100% within each preference.
p11 + geom_bar(position = "fill")
# Side-by-side counts; this is the version saved below.
p11 +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("pink", "#999999")) +
  xlab("Preference") +
  ggtitle("The association between gender and first preference (S3613572)")
# No `plot` argument is given, so ggsave() writes its default plot.
ggsave("plot5-11.png", width = 18, height = 12, units = "cm")
# Q12 ----
# Contingency table of gender (rows) by first preference (columns).
crosstab1 <- table(
  Water$Gender,
  Water$First,
  dnn = c("Gender", "Preference")
)
# Proportions within each gender (each row sums to 1).
prop.table(crosstab1, margin = 1)
## Preference
## Gender Aquafina Fiji Sam’s Choice Tap water
## F 0.2207792 0.4285714 0.2467532 0.1038961
## M 0.3125000 0.3437500 0.2187500 0.1250000
# Proportions within each preference (each column sums to 1).
prop.table(crosstab1, margin = 2)
## Preference
## Gender Aquafina Fiji Sam’s Choice Tap water
## F 0.6296296 0.7500000 0.7307692 0.6666667
## M 0.3703704 0.2500000 0.2692308 0.3333333
# Keep the row proportions in long form: one row per gender/preference pair.
crosstab1 <- data.frame(prop.table(crosstab1, margin = 1))
# Inspect the structure before renaming.
str(crosstab1)
## 'data.frame': 8 obs. of 3 variables:
## $ Gender : Factor w/ 2 levels "F","M": 1 2 1 2 1 2 1 2
## $ Preference: Factor w/ 4 levels "Aquafina","Fiji",..: 1 1 2 2 3 3 4 4
## $ Freq : num 0.221 0.312 0.429 0.344 0.247 ...
# Rename the third column from "Freq" to "Proportion".
names(crosstab1) <- c("Gender", "Preference", "Proportion")
str(crosstab1)
## 'data.frame': 8 obs. of 3 variables:
## $ Gender : Factor w/ 2 levels "F","M": 1 2 1 2 1 2 1 2
## $ Preference: Factor w/ 4 levels "Aquafina","Fiji",..: 1 1 2 2 3 3 4 4
## $ Proportion: num 0.221 0.312 0.429 0.344 0.247 ...
# Side-by-side bars of the within-gender proportions for each preference.
p12 <- ggplot(
  data = crosstab1,
  mapping = aes(x = Preference, y = Proportion, fill = Gender)
)
p12 +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("pink", "#999999")) +
  ggtitle("Proportion association between gender and first preference (S3613572)")
# No `plot` argument is given, so ggsave() writes its default plot.
ggsave("plot5-12.png", width = 18, height = 12, units = "cm")
# Q13 ----
# Q14 ----
# Read the sea ice data set from CSV into `Seaice`.
# NOTE(review): the path is absolute and machine-specific (Windows drive C:);
# adjust it before running on another machine.
Seaice <- read_csv("C:/RMIT/Data-Visualisation/R/Sea_ice.csv")
## Parsed with column specification:
## cols(
## Year = col_integer(),
## Ice = col_double()
## )
# Scatter plot of Ice against Year with two fitted trends: a linear model fit
# and, in red, the default smoother (reported as loess in the output below).
p14 <- ggplot(data = Seaice, mapping = aes(x = Year, y = Ice))
p14 +
  geom_point() +
  geom_smooth(method = "lm") +
  geom_smooth(colour = "red") +
  ggtitle("Time series plot showing the change in Arctic sea ice across time (S3613572)")
## `geom_smooth()` using method = 'loess'
# No `plot` argument is given, so ggsave() writes its default plot.
ggsave("plot5-14.png", width = 18, height = 12, units = "cm")
## `geom_smooth()` using method = 'loess'
# Q15 ----
# Read the shoe size data set from CSV into `Shoes`.
# NOTE(review): the path is absolute and machine-specific (Windows drive C:);
# adjust it before running on another machine.
Shoes <- read_csv("C:/RMIT/Data-Visualisation/R/Shoesize.csv")
## Parsed with column specification:
## cols(
## Index = col_integer(),
## Gender = col_character(),
## Size = col_double(),
## Height = col_double()
## )
# Q16 ----
# Scatter plot of shoe size against height with a linear fit, the default
# smoother in red (reported as loess in the output below) and marginal rugs.
# Columns are referenced by bare name inside aes() (not Shoes$Height /
# Shoes$Size) so every layer reads them from the plot's `data`.
p16 <- ggplot(data = Shoes, aes(x = Height, y = Size))
p16 +
  geom_point() +
  geom_smooth(method = "lm") +
  geom_smooth(colour = "red") +
  geom_rug(alpha = 1 / 2) +
  ggtitle("The relationship between height and shoe size (S3613572)") +
  xlab("Height") +
  ylab("Size")
## `geom_smooth()` using method = 'loess'
# No `plot` argument is given, so ggsave() writes its default plot.
ggsave("plot5-16.png", width = 18, height = 12, units = "cm")
## `geom_smooth()` using method = 'loess'
# Q18 ----
# Side-by-side box plots of shoe size for each gender, filled by gender with a
# light (25% opaque) fill.
p18 <- ggplot(
  data = Shoes,
  mapping = aes(x = Gender, y = Size, fill = Gender)
)
p18 +
  geom_boxplot(alpha = .25) +
  ggtitle("The shoe size between males and females (S3613572)") +
  theme_bw()
# No `plot` argument is given, so ggsave() writes its default plot.
ggsave("plot5-18.png", width = 18, height = 12, units = "cm")
# Q19 ----
# Median shoe size per gender, used to decide the display order.
Shoes_rank <- summarise(group_by(Shoes, Gender), med = median(Size))
# Order the Gender levels from the largest to the smallest median size.
Shoes$Gender <- factor(
  Shoes$Gender,
  levels = Shoes_rank$Gender[order(-Shoes_rank$med)]
)
# Box plots per gender with each group's mean marked as a red point.
p19 <- ggplot(
  data = Shoes,
  mapping = aes(x = Gender, y = Size, fill = Gender)
)
p19 +
  geom_boxplot(alpha = .25) +
  ggtitle("The shoe size between males and females (S3613572)") +
  theme_bw() +
  stat_summary(fun.y = "mean", geom = "point", colour = "red")
# No `plot` argument is given, so ggsave() writes its default plot.
ggsave("plot5-19.png", width = 18, height = 12, units = "cm")
# Q20 ----
# Median shoe size per gender, used to decide the display order.
Shoes_rank <- summarise(group_by(Shoes, Gender), med = median(Size))
# Order the Gender levels from the largest to the smallest median size.
Shoes$Gender <- factor(
  Shoes$Gender,
  levels = Shoes_rank$Gender[order(-Shoes_rank$med)]
)
# Violin plots per gender with the group mean (red point) and an error bar
# computed by the "mean_cl_boot" summary function.
p20 <- ggplot(
  data = Shoes,
  mapping = aes(x = Gender, y = Size, fill = Gender)
)
p20 +
  geom_violin(alpha = .25) +
  ggtitle("The shoe size between males and females (S3613572)") +
  theme_bw() +
  stat_summary(fun.y = "mean", geom = "point", colour = "red") +
  stat_summary(
    fun.data = "mean_cl_boot",
    colour = "red",
    geom = "errorbar",
    width = .2
  )
# No `plot` argument is given, so ggsave() writes its default plot.
ggsave("plot5-20.png", width = 18, height = 12, units = "cm")