NCSU M.S. Analytics

Alumni Data Analysis

By Aaron Stearns

In this R Markdown document, I used the rvest package to scrape all of the available information from the North Carolina State University website on every student to have attended the NCSU M.S. Analytics program from 2008 to present.

The following visualizations have all been created using the plotly package. All of the plots are interactive, with information available on hover, zoom, etc.

Some of the plots are animated to show student trends over time. Simply press the ‘Play’ button to start each animation.

All of the code used in this project is available at the bottom of this page.

First plot:

This first animated interactive plot shows the percentage of students in each cohort year who held 1, 2, 3, or 4 degrees upon entering the M.S. Analytics program at NCSU

Second plot:

This interactive plot shows the number of students in each year of the M.S. Analytics program

Third plot:

This animated plot shows all unique degree types (B.S., MBA, PhD, etc.) held by students in each cohort year.

Fourth plot:

The following animated plot focuses on the percentage of students in each cohort year who hold a B.A. degree vs a B.S. degree.

As you can see, the percentage of students holding a BA degree always seems to hover between 10% and 25%, never breaking 30%

Fifth plot:

This animation shows the number of years that have elapsed between a student’s completion of their first higher education degree and the year they started the M.S. Analytics program, for each student in each cohort.

Sixth plot:

Now let’s see if there is a big difference between the median number of years of experience vs. the mean, by cohort.

Seventh plot:

This interactive boxplot shows the number of years students in each cohort year have taken between finishing a previous degree and entering the M.S. Analytics program.

Eighth plot:

And finally, this last interactive boxplot displays the number of years between finishing a first degree and finishing the degree previous to the M.S. Analytics program for students holding 2 or more degrees upon entry into the program.

library(rvest)
library(dplyr)
library(stringr)
library(tidyr)
library(qdapTools)
library(sqldf)
library(tm)
library(lubridate)
library(zoo)
library(plotly)
library(reshape2)
library(ggplot2)


twenty18 <- read_html("http://analytics.ncsu.edu/?page_id=243")
twenty17 <- read_html("http://analytics.ncsu.edu/?page_id=10211")
twenty16 <- read_html("http://analytics.ncsu.edu/?page_id=9443")
twenty15 <- read_html("http://analytics.ncsu.edu/?page_id=7607")
twenty14 <- read_html("http://analytics.ncsu.edu/?page_id=5469")
twenty13 <- read_html("http://analytics.ncsu.edu/?page_id=4564")
twenty12 <- read_html("http://analytics.ncsu.edu/?page_id=3259")
twenty11 <- read_html("http://analytics.ncsu.edu/?page_id=2738")
twenty10 <- read_html("http://analytics.ncsu.edu/?page_id=1553")
twenty09 <- read_html("http://analytics.ncsu.edu/?page_id=814")
twenty08 <- read_html("http://analytics.ncsu.edu/?page_id=222")

df_18_text <- twenty18 %>% html_nodes("td") %>% html_text()
df_17_text <- twenty17 %>% html_nodes("td") %>% html_text()
df_16_text <- twenty16 %>% html_nodes("td") %>% html_text()
df_15_text <- twenty15 %>% html_nodes("td") %>% html_text()
df_14_text <- twenty14 %>% html_nodes("td") %>% html_text()
df_13_text <- twenty13 %>% html_nodes("td") %>% html_text()
df_12_text <- twenty12 %>% html_nodes("td") %>% html_text()
df_11_text <- twenty11 %>% html_nodes("td") %>% html_text()
df_10_text <- twenty10 %>% html_nodes("td") %>% html_text()
df_09_text <- twenty09 %>% html_nodes("td") %>% html_text()
df_08_text <- twenty08 %>% html_nodes("td") %>% html_text()

df18 <- data.frame(df_18_text, stringsAsFactors = FALSE)
df17 <- data.frame(df_17_text, stringsAsFactors = FALSE)
df16 <- data.frame(df_16_text, stringsAsFactors = FALSE)
df15 <- data.frame(df_15_text, stringsAsFactors = FALSE)
df14 <- data.frame(df_14_text, stringsAsFactors = FALSE)
df13 <- data.frame(df_13_text, stringsAsFactors = FALSE)
df12 <- data.frame(df_12_text, stringsAsFactors = FALSE)
df11 <- data.frame(df_11_text, stringsAsFactors = FALSE)
df10 <- data.frame(df_10_text, stringsAsFactors = FALSE)
df09 <- data.frame(df_09_text, stringsAsFactors = FALSE)
df08 <- data.frame(df_08_text, stringsAsFactors = FALSE)

colnames(df08) <- "data"
colnames(df09) <- "data"
colnames(df10) <- "data"
colnames(df11) <- "data"
colnames(df12) <- "data"
colnames(df13) <- "data"
colnames(df14) <- "data"
colnames(df15) <- "data"
colnames(df16) <- "data"
colnames(df17) <- "data"
colnames(df18) <- "data"

df <- rbind(df08, df09, df10, df11, df12, df13, df14, df15, df16, df17, df18)

even <- seq_len(nrow(df)) %% 2 

df <- data.frame(x=df[!even, ])

df_18_names <- twenty18 %>% html_nodes("strong") %>% html_text()
df_17_names <- twenty17 %>% html_nodes("strong") %>% html_text()
df_16_names <- twenty16 %>% html_nodes("strong") %>% html_text()
df_15_names <- twenty15 %>% html_nodes("strong") %>% html_text()
df_14_names <- twenty14 %>% html_nodes("strong") %>% html_text()
df_13_names <- twenty13 %>% html_nodes("strong") %>% html_text()
df_12_names <- twenty12 %>% html_nodes("strong") %>% html_text()
df_11_names <- twenty11 %>% html_nodes("strong") %>% html_text()
df_10_names <- twenty10 %>% html_nodes("strong") %>% html_text()
df_09_names <- twenty09 %>% html_nodes("strong") %>% html_text()
df_08_names <- twenty08 %>% html_nodes("strong") %>% html_text()

dfn18 <- data.frame(df_18_names, stringsAsFactors = FALSE)
dfn17 <- data.frame(df_17_names, stringsAsFactors = FALSE)
dfn16 <- data.frame(df_16_names, stringsAsFactors = FALSE)
dfn15 <- data.frame(df_15_names, stringsAsFactors = FALSE)
dfn14 <- data.frame(df_14_names, stringsAsFactors = FALSE)
dfn13 <- data.frame(df_13_names, stringsAsFactors = FALSE)
dfn12 <- data.frame(df_12_names, stringsAsFactors = FALSE)
dfn11 <- data.frame(df_11_names, stringsAsFactors = FALSE)
dfn10 <- data.frame(df_10_names, stringsAsFactors = FALSE)
dfn09 <- data.frame(df_09_names, stringsAsFactors = FALSE)
dfn08 <- data.frame(df_08_names, stringsAsFactors = FALSE)

colnames(dfn18) <- "name"
colnames(dfn17) <- "name"
colnames(dfn16) <- "name"
colnames(dfn15) <- "name"
colnames(dfn14) <- "name"
colnames(dfn13) <- "name"
colnames(dfn12) <- "name"
colnames(dfn11) <- "name"
colnames(dfn10) <- "name"
colnames(dfn09) <- "name"
colnames(dfn08) <- "name"

dfn <- rbind(dfn08, dfn09, dfn10, dfn11, dfn12, dfn13, dfn14, dfn15, dfn16, dfn17, dfn18)

df <- cbind(dfn, df)

dfNames <- df$name

df$x <- str_replace(df$x, dfNames, "")

df <- df %>% rename(
  name = name,
  degrees = x
)

df$degrees <- gsub('[[:punct:]]+', '', df$degrees)
df$name <- gsub('[[:punct:]]+', '', df$name)


df <- separate(data = df, col = degrees, into = c("one", "two", "three", "four", "five"), sep = "(?<=\\d{4})")

df[is.na(df)] <- "none 0000"

# add "M.S., Analytics NCSU, 2018" column to 2018 class data
df2018 <- subset(df[644:754, ])

df2018$one1 <- "MS Analytics NC State University 2018"

df2018 <- df2018 %>% select(name, one1, one, two, three, four)

df2018 <- df2018 %>% rename(
  name = name, 
  one = one1,
  two = one,
  three = two,
  four = three,
  five = four
)

df <- subset(df[1:643, ])

df <- rbind(df, df2018)

df <- df %>% select(name, one, two, three, four, five)

deg1 <- separate(data = df, col = one, into = c("deg1", "date1"), sep = "(\\d{4})")
deg2 <- separate(data = df, col = two, into = c("deg2", "date2"), sep = "(\\d{4})")
deg3 <- separate(data = df, col = three, into = c("deg3", "date3"), sep = "(\\d{4})")
deg4 <- separate(data = df, col = four, into = c("deg4", "date4"), sep = "(\\d{4})")
deg5 <- separate(data = df, col = five, into = c("deg5", "date5"), sep = "(\\d{4})")

df$year1 <- str_replace(df$one, deg1$deg1, "")
df$year2 <- str_replace(df$two, deg2$deg2, "")
df$year3 <- str_replace(df$three, deg3$deg3, "")
df$year4 <- str_replace(df$four, deg4$deg4, "")
df$year5 <- str_replace(df$five, deg5$deg5, "")

df$year1[is.na(df$year1)] <- "0000"
df$year2[is.na(df$year2)] <- "0000"
df$year3[is.na(df$year3)] <- "0000"
df$year4[is.na(df$year4)] <- "0000"
df$year5[is.na(df$year5)] <- "0000"

year_cln <- function(x) {
  x <- gsub('[a-zA-Z]+', '', x)
  x <- gsub('[[:punct:]]+', '', x)
}

df$year1 <- year_cln(df$year1)
df$year2 <- year_cln(df$year2)
df$year3 <- year_cln(df$year3)
df$year4 <- year_cln(df$year4)
df$year5 <- year_cln(df$year5)

df$one <- gsub('[0-9]+', '', df$one)
df$two <- gsub('[0-9]+', '', df$two)
df$three <- gsub('[0-9]+', '', df$three)
df$four <- gsub('[0-9]+', '', df$four)
df$five <- gsub('[0-9]+', '', df$five)

df$year1 <- as.numeric(df$year1)
df$year2 <- as.numeric(df$year2)
df$year3 <- as.numeric(df$year3)
df$year4 <- as.numeric(df$year4)
df$year5 <- as.numeric(df$year5)

df$year1[is.na(df$year1)] <- 0
df$year2[is.na(df$year2)] <- 0
df$year3[is.na(df$year3)] <- 0
df$year4[is.na(df$year4)] <- 0
df$year5[is.na(df$year5)] <- 0

# remove 2 improperly formatted rows / dropouts from the program
df <- df[-c(6, 543), ]

# found 3 people who didn't graduate
noGrad <- sqldf("SELECT * FROM df WHERE NOT one LIKE '%MS Anal%'")
noGrad <- noGrad[c(1, 2, 4), ]

# Shaina Race had her PhD in the MS Analytics field - switch them
df[305, 2] <- "MS Analytics NC State University"
df[305, 3] <- "PhD Operations Research NC State University"

# anti_join to remove non-MS Analytics grads
df <- df %>% anti_join(noGrad)

df[, 2] <- "MS Analytics NC State University"

# create function to split diploma type and school name
titleR <- function(x, a) {
  x <- as.data.frame(str_split_fixed(x, " ", 2))
  colnames(x) <- paste0(c("title", "school"), a)
  x
}

y1 <- titleR(df$one, 1)
y2 <- titleR(df$two, 2)
y3 <- titleR(df$three, 3)
y4 <- titleR(df$four, 4)
y5 <- titleR(df$five, 5)

df <- cbind(df$name, y1, df$year1, y2, df$year2, y3, df$year3, y4, df$year4, y5, df$year5)

colnames(df) <- c("name", "title1", "school1", "year1", "title2", "school2", "year2", "title3", "school3", "year3", "title4", "school4", "year4", "title5", "school5", "year5")

titles <- data.frame(df$name, df$year1, y1$title1, y2$title2, y3$title3, y4$title4, y5$title5)

########################

# create year lag column
# years between each degree
df$lag1 <- ifelse(df$year2 > 0, df$year1 - df$year2, 0)
df$lag2 <- ifelse(df$year3 > 0, df$year1 - df$year3, 0)
df$lag3 <- ifelse(df$year4 > 0, df$year1 - df$year4, 0)
df$lag4 <- ifelse(df$year5 > 0, df$year1 - df$year5, 0)

# create 'spread' column
# spread from 1st degree to masters degree - total time
df$spread4 <- ifelse(df$lag4 > 0, df$lag4, 0)
df$spread3 <- ifelse(df$lag3 > 0 & df$lag4 == 0,df$lag3, 0)
df$spread2 <- ifelse(df$lag2 > 0 & df$lag3 == 0, df$lag2, 0)
df$spread1 <- ifelse(df$lag1 > 0 & df$lag2 == 0, df$lag1, 0)

df <- df %>% mutate(spread = spread1 + spread2 + spread3 + spread4)

df <- df %>% mutate(spreadAdjusted = spread - 1)

# for those with more than 1 degree going into the MS Analy... program
df$eduSpread1 <- ifelse(df$year2 > 0, df$year2 - df$year2, 0)
df$eduSpread2 <- ifelse(df$year3 > 0, df$year2 - df$year3, 0)
df$eduSpread3 <- ifelse(df$year4 > 0, df$year2 - df$year4, 0)
df$eduSpread4 <- ifelse(df$year5 > 0, df$year2 - df$year5, 0)

df$hEd0 <- df$eduSpread1
df$hEd1 <- ifelse(df$eduSpread2 > 0 & df$eduSpread3 == 0, df$eduSpread2, 0)
df$hEd2 <- ifelse(df$eduSpread3 > 0 & df$eduSpread4 == 0, df$eduSpread3, 0)
df$hEd3 <- ifelse(df$eduSpread4 > 0, df$eduSpread4, 0)

df <- df %>% mutate(hEdSpread = hEd0 + hEd1 + hEd2 + hEd3)

df <- df %>% mutate(mastersLag = spreadAdjusted - hEdSpread)

# export .csv for visualizations in markdown doc
write.csv(df, file = "df.csv")

################################################################
#########################PLOTLY VISUALIZATIONS##################
################################################################


df <- read.csv("df.csv")
distinct <- read.csv("distinct.csv")


# boxplot all years
p1 <- plot_ly(df, x = ~year1, y = ~mastersLag, color = I("darkblue"),alpha = 0.7,marker = list(color = 'rgb(204,204,204)'), type = "box") %>% 
  layout(title = "Number of years between finishing previous \ndegree and entering M.S. Analytics, by cohort",
         xaxis = list(title = ""),
         yaxis = list(title = "# of years"))


# boxplot spread of higher education years for all degrees except MS Analytics
p2 <- plot_ly(df, x = ~year1, y = ~hEdSpread, color = I("black"), marker = list(color = 'rgb(204,204,204)'), type = "box") %>% 
  layout(title = "Number of years between finishing first degree \nand degree previous to M.S. Analytics, by cohort",
         xaxis = list(title = ""),
         yaxis = list(title = "# of years"))

p3 <- df %>% group_by(year1) %>% summarise(mean = mean(spreadAdjusted)) %>% plot_ly(x = ~year1, y = ~mean, type = "bar", name = "mean")


# means and trimmed means

means <- df %>% group_by(year1) %>% summarise(mean = mean(spreadAdjusted)) 

trimmedMeans <- df %>% group_by(year1) %>% summarise(mean = mean(spreadAdjusted, trim = 0.10)) 

p4 <- df %>% group_by(year1) %>% summarise(mean = mean(spreadAdjusted)) %>% plot_ly(x = ~year1, y = ~mean,    color = I("lightblue"), type = "bar", name = "Mean") %>% 
  add_trace(y = ~trimmedMeans$mean, name = "Trimmed Mean", color = I("darkblue")) %>% 
  layout(title = "Mean and trimmed mean (10% trimmed) \nnumber of years since first degree",
         yaxis = list(title = "# of years"), 
         xaxis = list(title = ""), 
         barmode = 'group')

# means and medians  
p5 <- df %>% group_by(year1) %>% summarise(med = median(spreadAdjusted)) %>% plot_ly(x = ~year1, y = ~med, color = I("red"), type = "bar", name = "median") %>% 
  add_trace(y = ~means$mean, color = I("orange"), name = "mean") %>%
  layout(title = "Mean and median number of years \nsince first degree, by cohort", 
         yaxis = list(title = "# of years"), 
         xaxis = list(title = ""), 
         barmode = 'group')

p6 <- df %>% group_by(year1) %>% summarise(rangeIQR = IQR(mastersLag)) %>% plot_ly(x = ~year1, y = ~rangeIQR, type = "bar")

pFacet <- ggplot(df, aes(x = mastersLag)) + geom_histogram() + facet_wrap(~year1, ncol = 3)


tots <- df %>%  group_by(year1, spreadAdjusted) %>% summarise(tots = n())

pA <- tots %>% 
  plot_ly(
    x = ~spreadAdjusted,
    y = ~tots,
    frame = ~year1,
    color = I("red"),
    type = 'bar',
    showlegend = F
  ) %>% 
  layout(title = "Years since finishing first degree, by cohort",
         xaxis = list(title = "# of years since first degree"),
         yaxis = list(title = "# of students"))%>% 
  animation_opts(
    frame = 1500, 
    redraw = TRUE
  )



pD <- plot_ly(distinct, x = ~year, y = ~cohTot, color = I("red"), alpha = 0.6, type = "bar") %>% 
  layout(title = "Cohort size, by year",
         xaxis = list(title = ""),
         yaxis = list(title = "# of students"))

df$title2 <-str_replace_all(df$title2, "\n", "")
df$title3 <-str_replace_all(df$title3, "\n", "")
df$title4 <-str_replace_all(df$title4, "\n", "")
df$title5 <-str_replace_all(df$title5, "\n", "")

t2 <- df %>% select(year1, title2) 
colnames(t2) <- c("year", "title")
t3 <- df %>% select(year1, title3)
colnames(t3) <- c("year", "title")
t4 <- df %>% select(year1, title4)
colnames(t4) <- c("year", "title")
t5 <- df %>% select(year1, title5)
colnames(t5) <- c("year", "title")

t1 <- rbind(t2, t3, t4, t5)

t1 <- t1[t1$title!="" & t1$title!="none", ]
t1 <- t1 %>% arrange(year) %>% group_by(year, title) %>% summarise(tots = n())

t01 <- t1 %>% filter(title == "BA" | title == "BS")

t01 <- t01 %>% mutate(yrTot = case_when(
  year == 2008 ~21,
  year == 2009 ~35,
  year == 2010 ~38,
  year == 2011 ~39,
  year == 2012 ~37,
  year == 2013 ~79,
  year == 2014 ~78,
  year == 2015 ~83,
  year == 2016 ~112,
  year == 2017 ~116,
  year == 2018 ~111)
) 

t01 <- t01 %>% mutate(degPerc = round(tots / yrTot, digits = 2))
t01 <- t01 %>% mutate(degCien = degPerc * 100)


pt1 <- t01 %>% 
  plot_ly(
    x = ~title,
    y = ~degCien,
    frame = ~year,
    color = ~title,
    type = 'bar',
    showlegend = F
  ) %>% 
  layout(title = "B.S. vs. B.A.",
         xaxis = list(title = "", tickangle = 85),
         yaxis = list(title = "% of students")) %>% 
  animation_opts(
    frame = 1500, 
    redraw = TRUE
)

pd1 <- t1 %>% 
  plot_ly(
    x = ~title,
    y = ~tots,
    frame = ~year,
    color = ~title,
    type = 'bar',
    showlegend = T
  ) %>% 
  layout(title = "Types of degrees held by students, by cohort",
         xaxis = list(title = "", tickangle = 45),
         yaxis = list(title = "# of students"))%>% 
  animation_opts(
    frame = 1500, 
    redraw = TRUE
  )

df$degFour <- ifelse(df$year5 > 0, 4, 0)
df$degThree <- ifelse(df$year4 > 0 & df$year5 < 1800, 3, 0)
df$degTwo <- ifelse(df$year3 > 0 & df$year4 < 1800, 2, 0)
df$degOne <- ifelse(df$year2 > 0 & df$year3 < 1800, 1, 0)

df <- df %>% mutate(degTotal = degOne + degTwo + degThree + degFour)

tots1 <- df %>% group_by(year1, degTotal) %>% summarise(tots = n()) %>% mutate(yrTot = case_when(
  year1 == 2008 ~21,
  year1 == 2009 ~35,
  year1 == 2010 ~38,
  year1 == 2011 ~39,
  year1 == 2012 ~37,
  year1 == 2013 ~79,
  year1 == 2014 ~78,
  year1 == 2015 ~83,
  year1 == 2016 ~112,
  year1 == 2017 ~116,
  year1 == 2018 ~111)
) 

tots1 <- tots1 %>% mutate(degPerc = round(tots / yrTot, digits = 2))
tots1 <- tots1 %>% mutate(degCien = degPerc * 100)
pP1 <- tots1 %>% 
  plot_ly(
    x = ~degTotal,
    y = ~degCien,
    frame = ~year1,
    color = I("red"),
    marker = list(opacity = 0.7, sizemode = 'diameter'),
    size = ~tots,
    sizes = c(30, 100),
    type = 'scatter',
    mode = 'markers',
    showlegend = F
  ) %>% 
  layout(title = "Number of degrees completed before \nentering M.S. Analytics, by cohort",
         xaxis = list(title = "# of degrees", showgrid = FALSE),
         yaxis = list(title = "% of students", showgrid = FALSE))%>% 
  animation_opts(
    frame = 1700, 
    redraw = TRUE
  )