Web Scraping from the Page Stated Below

webpage <- read_html("http://www.abs.gov.au/ausstats/abs@.nsf/mf/3310.0")

There was no downloadable file on this page, hence all statistics are taken directly from the page itself.

Of note: many commented-out function calls were used while assessing this data.

They are intentionally commented out so as not to clutter this assignment, and have

been left in as references for any possible further assessment.

# rvest supplies read_html(), html_nodes() and html_table(), which are used
# here and in the table-extraction code further down.
library(rvest)

# Download and parse the ABS page (catalogue 3310.0) that the marriage and
# divorce tables are scraped from.
webpage <- read_html("http://www.abs.gov.au/ausstats/abs@.nsf/mf/3310.0")

The code below generates two data frames from which I will extract information.

Note that the echo = FALSE parameter prevents printing of the R code that generated the plot

## Create two data frames to extract data from.
# Select every <table> node of the parsed page once and convert the two that
# are needed. `webpage` is the parsed document returned by read_html().
# NOTE(review): assumes the first two <table> nodes on the page are the
# marriage and divorce tables -- confirm against the live page.
page_tables <- html_nodes(webpage, "table")

# fill = TRUE asks html_table() to pad rows that have fewer cells than the
# widest row instead of failing on them.
Marriage <- html_table(page_tables[[1]], fill = TRUE)

#head(Marriage)
#str(Marriage)
#View(Marriage)

Divorce <- html_table(page_tables[[2]], fill = TRUE)
# Select particular rows and label their columns with the reporting years.

# Labels for the seven yearly figures kept from each extracted row.
year_labels <- c("1996", "2006", "2012", "2013", "2014", "2015", "2016")

# Rename the leading columns of a one-row extract to `year_labels`,
# leaving any further columns untouched, and return the renamed extract.
label_years <- function(row_df) {
  colnames(row_df)[seq_along(year_labels)] <- year_labels
  row_df
}

Col_1 <- Marriage[, 3:11]  ## column selection (not used further below)
#View(Col_1)

# Row 4 of the marriage table without its first four columns;
# this row is later named Num_Marriages.
Col_1a <- label_years(Marriage[4, -(1:4)])
#View(Col_1a)

# Rows of the divorce table without their first three columns.
Col_2a <- label_years(Divorce[4, -(1:3)])   ## No. of Divorces
#View(Col_2a)
Col_2b <- label_years(Divorce[10, -(1:3)])  ## Involving Children
Col_2c <- label_years(Divorce[19, -(1:3)])  ## Male initiated
Col_2d <- label_years(Divorce[20, -(1:3)])  ## Female initiated
# Stack the five one-row extracts: one row per measure, one column per year.
Combined <- rbind.data.frame(Col_1a, Col_2a, Col_2b, Col_2c, Col_2d)
#View(Combined)

Years <- c(1996, 2006, 2012, 2013, 2014, 2015, 2016) # need this column a little later

# Transpose so that each year becomes a row and each measure a column.
# t() on a data frame returns a matrix.
Combined2 <- t(Combined)
#str(Combined2)
#View(Combined2)

# Name the five measure columns in the order the rows were stacked above.
# (No "Year" name is set here: R indexes from 1, so assigning to index 0
# changes nothing. The year column is added by cbind() below.)
colnames(Combined2) <- c(
  "Num_Marriages",
  "Num_Divorces",
  "Div_Involving_Children",
  "Male_initiated_Divorce",
  "Female_initiated_Divorce"
)
#View(Combined2)
#str(Combined2)

# Prepend the year column; cbind() names it "Years" after the variable.
Combined2 <- as.data.frame(cbind(Years, Combined2))
row.names(Combined2) <- NULL   ##  Remove row names, and renumber as normal
#View(Combined2)
#View(Combined2)

I hit a wall here: the columns could not be converted to numeric while keeping their original values.

The answer was to do it the old-school way: the original values contained a comma separator, hence

the confusion everywhere. Seriously?

# Build Combined3: the same columns as Combined2, all converted to numeric.
#
# The scraped figures carry a comma as thousands separator, which
# as.numeric() cannot parse, so separators (and any stray whitespace) are
# stripped before converting. as.character() makes this work whether a
# column arrived as character or as factor.
to_number <- function(x) {
  as.numeric(gsub("[,[:space:]]", "", as.character(x)))
}

Combined3 <- Combined2
Combined3[] <- lapply(Combined2, to_number)

# Fail loudly instead of plotting NAs if any cell could not be parsed.
stopifnot(!anyNA(Combined3))

# Figures recorded by hand when this analysis was first written, kept for
# cross-checking the scraped values:
#   Years                    1996   2006   2012   2013   2014   2015   2016
#   Num_Marriages            106103 114222 123243 118959 121197 113595 118401
#   Num_Divorces             52466  51375  49917  47638  46498  48517  46604
#   Div_Involving_Children   28138  25733  24144  22590  21840  23063  21864
#   Male_initiated_Divorce   17005  15171  12958  12329  12090  12178  11763
#   Female_initiated_Divorce 24155  20574  17140  15658  15127  15337  14962

# head(Combined3)
# str(Combined3)
# View(Combined3)                ##   I use these for checking everything
# attributes(Combined3)
# typeof(Combined3)
# mode(Combined3)

Save the file in case I need it later, as this site had no downloadable .csv, .xlsx or .pdf files.

# write.csv(Combined3, file = "C:/Users/dan/Desktop/a Visualization/Combined3.csv", row.names = FALSE)

Now for the fun stuff

# Reset the base-graphics layout to a single panel. ggplot2 draws with grid
# graphics, so this does not change how the plot below is laid out.
par(mfrow = c(1, 1))
library(ggplot2)
theme_set(theme_bw())

# Lollipop chart of the number of marriages: one point per year with a stem
# running from zero up to the value.
ggplot(Combined3, aes(x = Years, y = Num_Marriages)) +
  geom_point(size = 3) +
  # `x` is inherited from the plot-level mapping. Columns are referenced by
  # bare name so ggplot2 looks them up in the layer data rather than in the
  # global `Combined3` object.
  geom_segment(aes(xend = Years, y = 0, yend = Num_Marriages)) +
  labs(
    title = "Lollipop Chart",
    subtitle = "Number of Marriages",
    caption = "source: abs.gov.au/ausstats/abs@.nsf/mf/3310.0"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))

# Reset the base-graphics layout to a single panel (no effect on ggplot2).
par(mfrow = c(1, 1))    ## this one's not so good
#library(ggplot2)
#library(Hmisc)      #------ gives a great plot  , this one not so good ??

# Dot plot of marriages per year, with the mean as a red point and a
# normal-theory confidence interval as a red error bar.
p1 <- ggplot(data = Combined3, aes(x = Years, y = Num_Marriages))
p1 +
  geom_dotplot(
    binaxis = "y",
    stackdir = "center",
    dotsize = 1 / 2,
    alpha = 0.25
  ) +
  # `fun` replaces the deprecated `fun.y` argument of stat_summary().
  stat_summary(fun = "mean", geom = "point", colour = "red") +
  # NOTE(review): "mean_cl_normal" is a ggplot2 wrapper around an Hmisc
  # function, so Hmisc presumably needs to be installed -- confirm.
  stat_summary(
    fun.data = "mean_cl_normal",
    colour = "red",
    geom = "errorbar",
    width = 0.2
  )
# Output recorded when the report was originally rendered:
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 7 rows containing missing values (geom_errorbar).

# Single-panel base-graphics layout.
par(mfrow = c(1, 1))
#str(Combined3)

#library(ggplot2)
library(ggcorrplot)
theme_set(theme_bw())

# Pairwise correlations between all columns of Combined3, to one decimal.
corr <- round(cor(Combined3), digits = 1)

# Lower-triangle correlogram drawn as circles, labelled with the rounded
# coefficients and ordered by hierarchical clustering.
ggcorrplot(
  corr,
  method = "circle",
  type = "lower",
  hc.order = TRUE,
  lab = TRUE,
  lab_size = 3,
  colors = c("tomato2", "white", "springgreen3"),
  ggtheme = theme_bw,
           title="Correlogram of Australian Marriages and Divorces 
              for years 1996, 2006, 2012 - 2016"
)

##                   Marginal Histogram / Boxplot     ## This one won't work correctly in Rmd ??
##                                                    ## I think it's more to do with my data set ?
# Reset the base-graphics layout to a single panel (no effect on ggplot2).
par(mfrow = c(1, 1))
# load package and data
#library(ggplot2)
library(ggExtra)

# Scatterplot
theme_set(theme_bw())  # pre-set the bw theme.

# Male- vs female-initiated divorces: points sized by the number of
# overlapping observations, plus a linear fit without its confidence band.
# `se = FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned.
g <- ggplot(Combined3, aes(Male_initiated_Divorce, Female_initiated_Divorce)) +
  geom_count() +
  geom_smooth(method = "lm", se = FALSE)
g

#ggMarginal(g, type = "histogram", fill="transparent")  ##  Select 1 for top and right
#ggMarginal(g, type = "boxplot", fill="transparent")    ##  side view
#ggMarginal(g, type = "density", fill="transparent")
# Single-panel base-graphics layout.
par(mfrow = c(1, 1))
#library(ggplot2)
theme_set(theme_bw())

# Violin plot of male-initiated divorces against female-initiated divorces.
g <- ggplot(
  Combined3,
  aes(Female_initiated_Divorce, Male_initiated_Divorce)
)
g +
  geom_violin() +
  labs(
    x = "Female_initiated_Divorce",
    y = "Male_initiated_Divorce",
    title = "Violin plot",
    subtitle = "Female_initiated_Divorce   vs   Male_initiated_Divorce",
    caption = "Source: abs.gov.au/ausstats/abs@.nsf/mf/3310.0"
  )

Conclusion

Findings to date

To be honest, I wish my data set were larger length-wise. I almost want to go and grab another data set and see if I can make a bigger, better, cooler-looking visual of something.