---
title: "Analysis of wage data of 3,000 male workers in Mid Atlantic Region from 2003-2009"
author: Elizabeth Healy
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
  flexdashboard::flexdashboard:
    toc: true
    theme: flatly
    orientation: rows
    social: "menu"
    sourcecode: embed
runtime: shiny
---

Information description {.sidebar}

This dataset contains information on age, marital status, race, level of education, health, health insurance coverage and wages for 3,000 male workers based in the Mid Atlantic region of the USA. The data was recorded from 2003 to 2009.

```{r echo = FALSE}
# Sidebar control that lets the reader pick how many bins the reactive wage
# histogram uses; the selection is exposed to the app as input$n_bins.
bin_choices <- c(10, 20, 35, 50)
selectInput(
  inputId = "n_bins",
  label = "Number of bins:",
  choices = bin_choices,
  selected = 20
)
```

```{r install-packages, include = FALSE}
# Install skimr, plotly and rsconnect only when they are not already
# available. Calling install.packages() unconditionally would re-download and
# reinstall all three packages every time the document is rendered.
required_pkgs <- c("skimr", "plotly", "rsconnect")
is_installed <- vapply(
  required_pkgs,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
missing_pkgs <- required_pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
```

```{r, include=FALSE}
# Register the shinyapps.io account used for deployment.
#
# SECURITY: the token and secret must never be written into the document.
# They are read from the SHINYAPPS_TOKEN and SHINYAPPS_SECRET environment
# variables (e.g. set in ~/.Renviron) instead. Any credentials that were
# previously committed in this file should be revoked and regenerated.
shinyapps_token <- Sys.getenv("SHINYAPPS_TOKEN")
shinyapps_secret <- Sys.getenv("SHINYAPPS_SECRET")

# Skip registration when the credentials are not configured, so rendering the
# dashboard does not depend on deployment secrets being present.
if (nzchar(shinyapps_token) && nzchar(shinyapps_secret)) {
  rsconnect::setAccountInfo(
    name = "elizabethhealy",
    token = shinyapps_token,
    secret = shinyapps_secret
  )
}
```

```{r load-packages, include = FALSE}
# Attach the packages the dashboard relies on (order kept as in the original).
library(tidyverse)
library(skimr)
library(plotly)         # ggplotly() for the interactive widgets
library(flexdashboard)  # valueBox(), gauge() and gaugeSectors()
library(DT)
```

```{r load-data, include = FALSE}
# Read the cleaned wage data once and keep it as the tibble `wage_clean`.
# The path is relative to this document so the dashboard also renders on other
# machines and when deployed; the earlier extra read from an absolute
# "C:/Users/..." path discarded its result and failed anywhere else.
wage_clean <- read_csv("wage_clean.csv")
```

```{r, include = FALSE}
# Breakdown of workers by marital status: count (n), proportion (prop) and
# percentage (pct). Computed once and stored, rather than running the same
# pipeline twice and discarding the first result.
wage_clean_tbl_1 <- wage_clean %>%
  count(maritl) %>%
  mutate(prop = n / sum(n), pct = prop * 100)

# Printed for interactive inspection only (the chunk is hidden when rendered).
wage_clean_tbl_1
```

Page 1

row

Number of workers who identify as White (/3,000)

```{r}
# Value box showing the (hard-coded) number of workers identifying as White.
white_workers <- 2479
valueBox(white_workers, icon = "fa-users")
```

Number of Workers with Advanced Degrees (/3,000)

```{r}
# Value box showing the (hard-coded) number of workers with advanced degrees.
advanced_degree_workers <- 426
valueBox(advanced_degree_workers, icon = "fa-user-graduate")
```

Number of workers with health insurance (/3,000)

```{r}
# Gauge of the (hard-coded) number of insured workers on a 0-3000 scale.
# Colour bands: danger below 2000, warning from 2000 to 2500, success above.
insurance_sectors <- gaugeSectors(
  success = c(2500, 3000),
  warning = c(2000, 2500),
  danger = c(0, 2000)
)
gauge(value = 2083, min = 0, max = 3000, sectors = insurance_sectors)
```

row

ggplots: Breakdown of Marital Status (out of 100%)

```{r, echo=FALSE}
# Bar chart of the percentage of workers in each marital-status category,
# drawn from the marital-status counts table with a fixed 0-100 y axis.
ggplot(wage_clean_tbl_1, aes(x = maritl, y = pct, fill = maritl)) +
  geom_bar(stat = "identity", colour = "black") +
  ylim(0, 100) +
  labs(
    x = "Marital Status",
    y = "Percent(%)",
    title = "Marital Status of Workers (%)"
  ) +
  theme_bw() +
  theme(legend.position = "none")
```

```{r, include = FALSE}
# Breakdown of workers by race: count (n), proportion (prop) and percentage
# (pct). Computed once and stored, rather than running the same pipeline
# twice and discarding the first result.
wage_clean_tbl_2 <- wage_clean %>%
  count(race) %>%
  mutate(prop = n / sum(n), pct = prop * 100)

# Printed for interactive inspection only (the chunk is hidden when rendered).
wage_clean_tbl_2
```

ggplots: Breakdown of Race (out of 100%)

```{r, echo=FALSE}
# Bar chart of the percentage of workers in each race category, drawn from
# the race counts table with a fixed 0-100 y axis.
ggplot(wage_clean_tbl_2, aes(x = race, y = pct, fill = race)) +
  geom_bar(stat = "identity", colour = "black") +
  ylim(0, 100) +
  labs(
    x = "Racial Status",
    y = "Percent(%)",
    title = "Race of Workers (%)"
  ) +
  theme_bw() +
  theme(legend.position = "none")
```

```{r, include = FALSE}
# Breakdown of workers by education level: count (n), proportion (prop) and
# percentage (pct). Computed once and stored, rather than running the same
# pipeline in two separate chunks and discarding the first result.
wage_clean_tbl_3 <- wage_clean %>%
  count(education) %>%
  mutate(prop = n / sum(n), pct = prop * 100)

# Printed for interactive inspection only (the chunk is hidden when rendered).
wage_clean_tbl_3
```

ggplots: Breakdown of Education Levels (out of 100%)

```{r, echo=FALSE}
# Bar chart of the percentage of workers at each education level, drawn from
# the education counts table with a fixed 0-100 y axis.
ggplot(wage_clean_tbl_3, aes(x = education, y = pct, fill = education)) +
  geom_bar(stat = "identity", colour = "black") +
  ylim(0, 100) +
  labs(
    x = "Education Levels",
    y = "Percent(%)",
    title = "Levels of Education of Workers (%)"
  ) +
  theme_bw() +
  theme(legend.position = "none")
```

```{r, include = FALSE}
# Breakdown of workers by job class: count (n), proportion (prop) and
# percentage (pct). Computed once and stored, rather than running the same
# pipeline in two separate chunks and discarding the first result.
wage_clean_tbl_4 <- wage_clean %>%
  count(jobclass) %>%
  mutate(prop = n / sum(n), pct = prop * 100)

# Printed for interactive inspection only (the chunk is hidden when rendered).
wage_clean_tbl_4
```

row

ggplots: Breakdown of Jobclass (out of 100%)

```{r, echo=FALSE}
# Bar chart of the percentage of workers in each job class, drawn from the
# job-class counts table with a fixed 0-100 y axis.
ggplot(wage_clean_tbl_4, aes(x = jobclass, y = pct, fill = jobclass)) +
  geom_bar(stat = "identity", colour = "black") +
  ylim(0, 100) +
  labs(
    x = "Jobclass: Industry v Information",
    y = "Percent(%)",
    title = "% of workers in Industrial V Information jobs"
  ) +
  theme_bw() +
  theme(legend.position = "none")
```

```{r, include = FALSE}
# Breakdown of workers by health status: count (n), proportion (prop) and
# percentage (pct). Computed once and stored, rather than running the same
# pipeline in two separate chunks and discarding the first result.
wage_clean_tbl_5 <- wage_clean %>%
  count(health) %>%
  mutate(prop = n / sum(n), pct = prop * 100)

# Printed for interactive inspection only (the chunk is hidden when rendered).
wage_clean_tbl_5
```

ggplots: Breakdown of Workers' Health (out of 100%)

```{r, echo=FALSE}
# Bar chart of the percentage of workers in each health category, drawn from
# the health counts table with a fixed 0-100 y axis.
ggplot(wage_clean_tbl_5, aes(x = health, y = pct, fill = health)) +
  geom_bar(stat = "identity", colour = "black") +
  ylim(0, 100) +
  labs(
    x = "Quality of Health of Workers",
    y = "Percent(%)",
    title = "Quality of Health of Workers (%)"
  ) +
  theme_bw() +
  theme(legend.position = "none")
```

```{r, include = FALSE}
# Breakdown of workers by health-insurance status: count (n), proportion
# (prop) and percentage (pct). Computed once and stored, rather than running
# the same pipeline in two separate chunks and discarding the first result.
wage_clean_tbl_6 <- wage_clean %>%
  count(health_ins) %>%
  mutate(prop = n / sum(n), pct = prop * 100)

# Printed for interactive inspection only (the chunk is hidden when rendered).
wage_clean_tbl_6
```

ggplots: Breakdown of % Workers with Health Insurance

```{r, echo=FALSE}
# Bar chart of the percentage of workers with and without health insurance,
# drawn from the health-insurance counts table with a fixed 0-100 y axis.
ggplot(wage_clean_tbl_6, aes(x = health_ins, y = pct, fill = health_ins)) +
  geom_bar(stat = "identity", colour = "black") +
  ylim(0, 100) +
  labs(
    x = "Health Insurance Status",
    y = "Percent(%)",
    title = "% of Workers with Health Insurance"
  ) +
  theme_bw() +
  theme(legend.position = "none")
```

```{r, include = FALSE}
# Static 20-bin histogram of the raw wage values, stored in fig2 so it can be
# turned into an interactive widget later in the dashboard.
fig2 <- ggplot(wage_clean, aes(x = wage)) +
  geom_histogram(bins = 20, fill = "#006633", colour = "black") +
  labs(x = "wage", title = "Count of workers on various Raw Wage levels") +
  theme_bw()
```

Page 2

row

Html Widgets: Interactive Histogram

```{r, echo=FALSE}
# Convert the stored wage histogram (fig2) into an interactive plotly widget.
fig2 %>%
  ggplotly()
```

row

Reactive Histogram using Shiny

```{r, echo=FALSE}
# Reactive wage histogram: redrawn whenever the "Number of bins" selection
# (input$n_bins) changes. The selection is converted to a number before it is
# passed to geom_histogram().
renderPlot({
  bin_count <- as.numeric(input$n_bins)

  ggplot(wage_clean, aes(x = wage)) +
    geom_histogram(bins = bin_count, fill = "#30686f", colour = "black") +
    labs(x = "wage", title = "Count of workers on various Raw Wage levels") +
    theme_bw()
})
```

```{r, include = FALSE}
# Five-number summary of the raw wage data: first quartile (Q1), median (q2),
# third quartile (Q3), minimum and maximum, ignoring missing values.
# `probs` is spelled out in full instead of relying on partial argument
# matching (`prob`).
five.num.sum.wage <- wage_clean %>%
  summarise(
    Q1 = quantile(wage, probs = 0.25, na.rm = TRUE),
    q2 = median(wage, na.rm = TRUE),
    Q3 = quantile(wage, probs = 0.75, na.rm = TRUE),
    min = min(wage, na.rm = TRUE),
    max = max(wage, na.rm = TRUE)
  )
```

```{r, include=FALSE}
# Printed for interactive inspection only (the chunk is hidden when rendered).
five.num.sum.wage

# Boxplot fences at 1.5 * IQR beyond the quartiles. They are stored in named
# variables so the values remain available instead of being computed and
# immediately discarded.
wage_iqr <- five.num.sum.wage$Q3 - five.num.sum.wage$Q1
lower_fence <- five.num.sum.wage$Q1 - 1.5 * wage_iqr
upper_fence <- five.num.sum.wage$Q3 + 1.5 * wage_iqr

# Boxplot of the raw wage data, stored in fig4 for the interactive widget.
fig4 <- wage_clean %>%
  ggplot(aes(y = wage)) +
  geom_boxplot(fill = "#006633", colour = "black") +
  ylab("Raw wage") +
  ggtitle("Median and IQR of Raw Wage data") +
  theme_bw()
```

Html Widgets: Interactive Boxplot

```{r, echo=FALSE}
# Convert the stored wage boxplot (fig4) into an interactive plotly widget.
fig4 %>%
  ggplotly()
```

```{r, include = FALSE}
# Overall median of the wage column, ignoring missing values.
summarise(wage_clean, median_wage = median(wage, na.rm = TRUE))
```

```{r, include = FALSE}
# Overall interquartile range of the wage column, ignoring missing values.
summarise(wage_clean, IQR_wage = IQR(wage, na.rm = TRUE))
```

Page 3

row

Table: Median & IQR wage based on Education Level

```{r, echo=FALSE}
# Median and interquartile range of raw wage for each education level,
# ignoring missing wage values.
wage_by_education <- group_by(wage_clean, education)
summarise(
  wage_by_education,
  median_wage = median(wage, na.rm = TRUE),
  IQR_wage = IQR(wage, na.rm = TRUE)
)
```

Based on the above data, workers with advanced degrees earn a much higher median raw wage than other workers (142 Versus the next highest median wage of 119, belonging to workers who are College Graduates). There is a clear correlation between a higher raw wage (based on median and IQR values) and the worker's education level. This is further emphasised in the grouped boxplot.

Html widget: Interactive Grouped Boxplot (Wages grouped by Education Level)

```{r, echo=FALSE}
# Grouped boxplot of raw wage by education level, one box per level with the
# legend suppressed, stored in fig8.
fig8 <- ggplot(wage_clean, aes(y = wage, x = education, fill = education)) +
  geom_boxplot() +
  labs(
    y = "Raw wage",
    x = "Education Level",
    title = "Median & IQR Wage based on Education Level"
  ) +
  theme_bw() +
  theme(legend.position = "none")

# Render the boxplot as an interactive plotly widget.
ggplotly(fig8)
```

row

Table: Median & IQR wage based on Race

```{r, echo=FALSE}
# Median and interquartile range of raw wage for each race category,
# ignoring missing wage values.
wage_by_race <- group_by(wage_clean, race)
summarise(
  wage_by_race,
  median_wage = median(wage, na.rm = TRUE),
  IQR_wage = IQR(wage, na.rm = TRUE)
)
```

Based on the above data, Asian workers earn a higher median raw wage than all other ethnic types (114 Versus the next highest median wage of 106). This is despite the fact that White workers make up over 80% of the worker population. The largest median wage belongs to the NA group of which we have no data. There are 9 NA values in the raw wage dataset. With a median wage far in excess of the overall median and the next highest median value based on race, further analysis is required.

Html widget: Interactive Grouped Boxplot (Wages grouped by Race)

```{r, echo=FALSE}
# Grouped boxplot of raw wage by race, one box per category with the legend
# suppressed, stored in fig9.
fig9 <- ggplot(wage_clean, aes(y = wage, x = race, fill = race)) +
  geom_boxplot() +
  labs(
    y = "Raw wage",
    x = "Race",
    title = "Median & IQR raw wage data based on race"
  ) +
  theme_bw() +
  theme(legend.position = "none")

# Render the boxplot as an interactive plotly widget.
ggplotly(fig9)
```

row

Table: Median & IQR wage based on Jobclass

```{r, echo=FALSE}
# Median and interquartile range of raw wage for each job class, ignoring
# missing wage values.
wage_by_jobclass <- group_by(wage_clean, jobclass)
summarise(
  wage_by_jobclass,
  median_wage = median(wage, na.rm = TRUE),
  IQR_wage = IQR(wage, na.rm = TRUE)
)
```

Based on the median and IQR values of Industrial and Information workers' raw wages, information workers appear to earn more with a slightly higher median raw wage value of 113 Versus 100 and an IQR wage value of 46 Versus 38.

Html widget: Interactive Grouped Boxplot (Wages grouped by Jobclass)

```{r, echo=FALSE}
# Grouped boxplot of raw wage by job class, one box per class with the legend
# suppressed, stored in fig10.
fig10 <- ggplot(wage_clean, aes(y = wage, x = jobclass, fill = jobclass)) +
  geom_boxplot() +
  labs(
    y = "Raw wage",
    x = "Jobclass: Industrial V Information",
    title = "Median & IQR raw wage data based on jobclass"
  ) +
  theme_bw() +
  theme(legend.position = "none")

# Render the boxplot as an interactive plotly widget.
ggplotly(fig10)
```