---
title: "Analysis of wage data of 3,000 male workers in Mid Atlantic Region from 2003-2009"
author: Elizabeth Healy
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
  flexdashboard::flex_dashboard:
    toc: true
    theme: flatly
    orientation: rows
    social: "menu"
    source_code: embed
runtime: shiny
---
This dataset contains information on age, marital status, race, level of education, health, health insurance coverage and wages for 3,000 male workers based in the Mid Atlantic region of the USA. The data was recorded from 2003 to 2009.
```{r echo = FALSE}
# Drop-down that lets the reader pick the histogram bin count; the chosen
# value is exposed to reactive code as input$n_bins.
selectInput(
  inputId = "n_bins",
  label = "Number of bins:",
  choices = c(10, 20, 35, 50),
  selected = 20
)
```
```{r install-packages, include = FALSE}
# Install the helper packages only when they are not already available.
# An unconditional install.packages() re-downloads on every render and fails
# in a non-interactive session that has no CRAN mirror configured.
required_pkgs <- c("skimr", "plotly", "rsconnect")
is_installed <- vapply(
  required_pkgs,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
missing_pkgs <- required_pkgs[!is_installed]

if (length(missing_pkgs) > 0) {
  # An explicit mirror avoids the "trying to use CRAN without setting a
  # mirror" error when knitting.
  install.packages(missing_pkgs, repos = "https://cloud.r-project.org")
}
```
```{r, include=FALSE}
# Register the shinyapps.io account used for deployment.
#
# Credentials are read from environment variables (e.g. set in ~/.Renviron)
# instead of being written into the document: this dashboard embeds its own
# source code, so any literal token/secret here would be published with it.
# NOTE(review): the token/secret that were previously hard-coded in this chunk
# should be treated as compromised and revoked in the shinyapps.io dashboard.
shinyapps_name <- Sys.getenv("SHINYAPPS_NAME", unset = "elizabethhealy")
shinyapps_token <- Sys.getenv("SHINYAPPS_TOKEN")
shinyapps_secret <- Sys.getenv("SHINYAPPS_SECRET")

# Only register when both secrets are present, so rendering the dashboard on a
# machine without deployment credentials still works.
if (nzchar(shinyapps_token) && nzchar(shinyapps_secret)) {
  rsconnect::setAccountInfo(
    name = shinyapps_name,
    token = shinyapps_token,
    secret = shinyapps_secret
  )
}
```
```{r load-packages, include = FALSE}
# Attach the packages used throughout the dashboard. The attach order is kept
# as-is because later packages may mask functions from earlier ones.
library(tidyverse)      # data wrangling and ggplot2
library(skimr)          # summary statistics helpers
library(plotly)         # interactive versions of ggplot figures
library(flexdashboard)  # valueBox(), gauge() and the dashboard layout
library(DT)             # interactive tables
```
```{r load-data, include = FALSE}
# Read the cleaned wage data once and keep it as a tibble.
# The path is relative to this document so the dashboard renders on any
# machine (and when deployed); the earlier extra read from an absolute
# "C:/Users/..." path discarded its result and failed everywhere else.
wage_clean <- read_csv("wage_clean.csv")
```
```{r, include=FALSE}
# Breakdown of marital status: count, proportion and percentage per level.
# Stored for the bar chart and echoed (hidden) for inspection while knitting.
wage_clean_tbl_1 <- wage_clean %>%
  count(maritl) %>%
  mutate(
    prop = n / sum(n),
    pct = prop * 100
  )

wage_clean_tbl_1
```
```{r}
# NOTE(review): hard-coded headline figure; presumably a count of workers
# taken from wage_clean - confirm which group it represents.
valueBox(
  value = 2479,
  icon = "fa-users"
)
```

```{r}
# NOTE(review): hard-coded headline figure; the graduate icon suggests an
# education-related count - confirm against the data.
valueBox(
  value = 426,
  icon = "fa-user-graduate"
)
```
```{r}
# Gauge on a 0-3000 scale with three colour bands.
# NOTE(review): 2083 is hard-coded; confirm what it counts and whether it
# should be derived from wage_clean.
gauge_bands <- gaugeSectors(
  success = c(2500, 3000),
  warning = c(2000, 2500),
  danger = c(0, 2000)
)

gauge(
  value = 2083,
  min = 0,
  max = 3000,
  sectors = gauge_bands
)
```
```{r, echo=FALSE}
# Bar chart of the marital-status percentages held in wage_clean_tbl_1.
ggplot(wage_clean_tbl_1, aes(x = maritl, y = pct, fill = maritl)) +
  geom_col(colour = "black") +
  ylim(0, 100) +
  labs(
    x = "Marital Status",
    y = "Percent(%)",
    title = "Marital Status of Workers (%)"
  ) +
  theme_bw() +
  theme(legend.position = "none")
```
```{r, include=FALSE}
# Breakdown of race: count, proportion and percentage per level.
# Stored for the bar chart and echoed (hidden) for inspection while knitting.
wage_clean_tbl_2 <- wage_clean %>%
  count(race) %>%
  mutate(
    prop = n / sum(n),
    pct = prop * 100
  )

wage_clean_tbl_2
```
```{r, echo=FALSE}
# Bar chart of the race percentages held in wage_clean_tbl_2.
ggplot(wage_clean_tbl_2, aes(x = race, y = pct, fill = race)) +
  geom_col(colour = "black") +
  ylim(0, 100) +
  labs(
    x = "Racial Status",
    y = "Percent(%)",
    title = "Race of Workers (%)"
  ) +
  theme_bw() +
  theme(legend.position = "none")
```
```{r, include=FALSE}
# Breakdown of education level: count, proportion and percentage per level.
# Stored for the bar chart and echoed (hidden) for inspection while knitting.
wage_clean_tbl_3 <- wage_clean %>%
  count(education) %>%
  mutate(
    prop = n / sum(n),
    pct = prop * 100
  )

wage_clean_tbl_3
```
```{r, echo=FALSE}
# Bar chart of the education-level percentages held in wage_clean_tbl_3.
ggplot(wage_clean_tbl_3, aes(x = education, y = pct, fill = education)) +
  geom_col(colour = "black") +
  ylim(0, 100) +
  labs(
    x = "Education Levels",
    y = "Percent(%)",
    title = "Levels of Education of Workers (%)"
  ) +
  theme_bw() +
  theme(legend.position = "none")
```
```{r, include=FALSE}
# Breakdown of job class (industrial vs information): count, proportion and
# percentage per level. Stored for the bar chart and echoed (hidden) for
# inspection while knitting.
wage_clean_tbl_4 <- wage_clean %>%
  count(jobclass) %>%
  mutate(
    prop = n / sum(n),
    pct = prop * 100
  )

wage_clean_tbl_4
```
```{r, echo=FALSE}
# Bar chart of the job-class percentages held in wage_clean_tbl_4.
ggplot(wage_clean_tbl_4, aes(x = jobclass, y = pct, fill = jobclass)) +
  geom_col(colour = "black") +
  ylim(0, 100) +
  labs(
    x = "Jobclass: Industry v Information",
    y = "Percent(%)",
    title = "% of workers in Industrial V Information jobs"
  ) +
  theme_bw() +
  theme(legend.position = "none")
```
```{r, include=FALSE}
# Breakdown of health quality: count, proportion and percentage per level.
# Stored for the bar chart and echoed (hidden) for inspection while knitting.
wage_clean_tbl_5 <- wage_clean %>%
  count(health) %>%
  mutate(
    prop = n / sum(n),
    pct = prop * 100
  )

wage_clean_tbl_5
```
```{r, echo=FALSE}
# Bar chart of the health-quality percentages held in wage_clean_tbl_5.
ggplot(wage_clean_tbl_5, aes(x = health, y = pct, fill = health)) +
  geom_col(colour = "black") +
  ylim(0, 100) +
  labs(
    x = "Quality of Health of Workers",
    y = "Percent(%)",
    title = "Quality of Health of Workers (%)"
  ) +
  theme_bw() +
  theme(legend.position = "none")
```
```{r, include=FALSE}
# Breakdown of health-insurance status: count, proportion and percentage per
# level. Stored for the bar chart and echoed (hidden) for inspection while
# knitting.
wage_clean_tbl_6 <- wage_clean %>%
  count(health_ins) %>%
  mutate(
    prop = n / sum(n),
    pct = prop * 100
  )

wage_clean_tbl_6
```
```{r, echo=FALSE}
# Bar chart of the health-insurance percentages held in wage_clean_tbl_6.
ggplot(wage_clean_tbl_6, aes(x = health_ins, y = pct, fill = health_ins)) +
  geom_col(colour = "black") +
  ylim(0, 100) +
  labs(
    x = "Health Insurance Status",
    y = "Percent(%)",
    title = "% of Workers with Health Insurance"
  ) +
  theme_bw() +
  theme(legend.position = "none")
```
```{r, include = FALSE}
# Static 20-bin histogram of the raw wage variable, kept in fig2 so it can be
# converted to an interactive widget in the next chunk.
fig2 <- ggplot(wage_clean, aes(x = wage)) +
  geom_histogram(fill = "#006633", colour = "black", bins = 20) +
  labs(
    x = "wage",
    title = "Count of workers on various Raw Wage levels"
  ) +
  theme_bw()
```

```{r, echo=FALSE}
# Interactive (plotly) rendering of the histogram stored in fig2.
ggplotly(fig2)
```
```{r, echo=FALSE}
# Reactive histogram of raw wage: redrawn whenever the "Number of bins"
# selector (input$n_bins) changes.
renderPlot({
  # The select input's value is converted to a number before use as a bin count.
  bin_count <- as.numeric(input$n_bins)

  ggplot(wage_clean, aes(x = wage)) +
    geom_histogram(fill = "#30686f", colour = "black", bins = bin_count) +
    labs(
      x = "wage",
      title = "Count of workers on various Raw Wage levels"
    ) +
    theme_bw()
})
```
```{r, include = FALSE}
# Five-number summary of the raw wage variable (Q1, median, Q3, min, max),
# ignoring missing wages. Q1 and Q3 are used afterwards to compute the
# boxplot fences.
# `probs` is spelled out in full: the previous `prob =` only worked through
# R's partial argument matching.
five.num.sum.wage <- wage_clean %>%
  summarise(
    Q1 = quantile(wage, probs = 0.25, na.rm = TRUE),
    q2 = median(wage, na.rm = TRUE),
    Q3 = quantile(wage, probs = 0.75, na.rm = TRUE),
    min = min(wage, na.rm = TRUE),
    max = max(wage, na.rm = TRUE)
  )
```
```{r, include=FALSE}
# Show the five-number summary (hidden in the rendered output).
five.num.sum.wage

# Boxplot fences: 1.5 * IQR below Q1 and above Q3.
wage_iqr <- five.num.sum.wage$Q3 - five.num.sum.wage$Q1
five.num.sum.wage$Q1 - 1.5 * wage_iqr  # lower fence
five.num.sum.wage$Q3 + 1.5 * wage_iqr  # upper fence

# Boxplot of the raw wage variable, kept in fig4 for the interactive version.
fig4 <- ggplot(wage_clean, aes(y = wage)) +
  geom_boxplot(fill = "#006633", colour = "black") +
  labs(
    y = "Raw wage",
    title = "Median and IQR of Raw Wage data"
  ) +
  theme_bw()
```

```{r, echo=FALSE}
# Interactive (plotly) rendering of the boxplot stored in fig4.
ggplotly(fig4)
```
```{r, include = FALSE}
# Overall median of the raw wage variable, ignoring missing values.
wage_clean %>%
  summarise(
    median_wage = median(wage, na.rm = TRUE)
  )
```

```{r, include = FALSE}
# Overall interquartile range of the raw wage variable, ignoring missing values.
wage_clean %>%
  summarise(
    IQR_wage = IQR(wage, na.rm = TRUE)
  )
```
```{r, echo=FALSE}
# Median and IQR of raw wage for each education level (missing wages ignored).
wage_clean %>%
  group_by(education) %>%
  summarise(
    median_wage = median(wage, na.rm = TRUE),
    IQR_wage = IQR(wage, na.rm = TRUE)
  )
```

Based on the above data, workers with advanced degrees earn a much higher median raw wage than other workers (142 Versus the next highest median wage of 119, belonging to workers who are College Graduates). There is a clear correlation between a higher raw wage (based on median and IQR values) and the worker's education level. This is further emphasised in the grouped boxplot.
```{r, echo=FALSE}
# Grouped boxplots of raw wage, one box per education level.
fig8 <- ggplot(wage_clean, aes(y = wage, x = education, fill = education)) +
  geom_boxplot() +
  labs(
    y = "Raw wage",
    x = "Education Level",
    title = "Median & IQR Wage based on Education Level"
  ) +
  theme_bw() +
  theme(legend.position = "none")

# Interactive (plotly) rendering of the grouped boxplot.
ggplotly(fig8)
```
```{r, echo=FALSE}
# Median and IQR of raw wage for each race group (missing wages ignored).
wage_clean %>%
  group_by(race) %>%
  summarise(
    median_wage = median(wage, na.rm = TRUE),
    IQR_wage = IQR(wage, na.rm = TRUE)
  )
```

Based on the above data, Asian workers earn a higher median raw wage than all other ethnic types (114 Versus the next highest median wage of 106). This is despite the fact that White workers make up over 80% of the worker population. The largest median wage belongs to the NA group of which we have no data. There are 9 NA values in the raw wage dataset. With a median wage far in excess of the overall median and the next highest median value based on race, further analysis is required.
```{r, echo=FALSE}
# Grouped boxplots of raw wage, one box per race group.
fig9 <- ggplot(wage_clean, aes(y = wage, x = race, fill = race)) +
  geom_boxplot() +
  labs(
    y = "Raw wage",
    x = "Race",
    title = "Median & IQR raw wage data based on race"
  ) +
  theme_bw() +
  theme(legend.position = "none")

# Interactive (plotly) rendering of the grouped boxplot.
ggplotly(fig9)
```
```{r, echo=FALSE}
# Median and IQR of raw wage for each job class (missing wages ignored).
wage_clean %>%
  group_by(jobclass) %>%
  summarise(
    median_wage = median(wage, na.rm = TRUE),
    IQR_wage = IQR(wage, na.rm = TRUE)
  )
```

Based on the median and IQR values of Industrial and Information workers' raw wages, information workers appear to earn more with a slightly higher median raw wage value of 113 Versus 100 and an IQR wage value of 46 Versus 38.
```{r, echo=FALSE}
# Grouped boxplots of raw wage, one box per job class.
fig10 <- ggplot(wage_clean, aes(y = wage, x = jobclass, fill = jobclass)) +
  geom_boxplot() +
  labs(
    y = "Raw wage",
    x = "Jobclass: Industrial V Information",
    title = "Median & IQR raw wage data based on jobclass"
  ) +
  theme_bw() +
  theme(legend.position = "none")

# Interactive (plotly) rendering of the grouped boxplot.
ggplotly(fig10)
```