R - Final Project

Do Four-Year Graduates From Mid-Atlantic Colleges Earn at Least 10% More Than Two-Year Graduates?

RPub link : https://rpubs.com/cliftonleesps/r_week3_final

# 5. BONUS – place the original .csv in a github file and have R read from the link. This will be a very useful skill as you progress in your data science education and career.

# Attach the packages whose functions are called unqualified further down
# (str_detect()/str_replace() from stringr, ggplot() and its layers from
# ggplot2); no other library() call is visible in this script.
library(stringr)
library(ggplot2)

# Read the scorecard CSV straight from GitHub.
# read.csv() is used rather than read.table(sep = ","): by default
# read.table() also treats the apostrophe as a quote character and "#" as a
# comment marker, so any field containing either can be mis-parsed, while
# read.csv() only honours double quotes and disables comment parsing.
data_frame <- read.csv(
  file = "https://raw.githubusercontent.com/cliftonleesps/dataset/main/scorecard.csv",
  header = TRUE
)

# Drop the unnecessary columns: keep only the six that the rest of the
# analysis reads, in this order.
data_frame <- data_frame[, c(
  "inst_name",
  "state_abbr",
  "pred_degree_awarded_ipeds",
  "earnings_med",
  "count_not_working",
  "count_working"
)]

# 2. Data wrangling: Please perform some basic transformations. They will need to make sense but could include column renaming, creating a subset of the data, replacing values, or creating new columns with derived data (for example – if it makes sense you could sum two columns together)

# Create two subsets, for two-year and four-year colleges in the Mid-Atlantic
# states.
# State codes are tested with %in% (exact set membership) instead of an
# unanchored regular expression, so only these exact codes match and no string
# package is needed for this step. The list is written once and shared by both
# subsets so they cannot drift apart.
mid_atlantic_states <- c("NY", "NJ", "DE", "MD", "DC", "VA", "WV")

# This script treats pred_degree_awarded_ipeds == 2 as the two-year group.
# subset() drops rows where the condition evaluates to NA.
mid_atlantic_2 <- subset(
  data_frame,
  state_abbr %in% mid_atlantic_states & pred_degree_awarded_ipeds == 2
)

# This script treats pred_degree_awarded_ipeds == 3 as the four-year group.
mid_atlantic_4 <- subset(
  data_frame,
  state_abbr %in% mid_atlantic_states & pred_degree_awarded_ipeds == 3
)

# Remove pred_degree_awarded_ipeds: each subset already holds a single degree
# level, so only the five remaining columns are kept.
mid_atlantic_2 <- mid_atlantic_2[, c(
  "inst_name", "state_abbr", "earnings_med",
  "count_not_working", "count_working"
)]

mid_atlantic_4 <- mid_atlantic_4[, c(
  "inst_name", "state_abbr", "earnings_med",
  "count_not_working", "count_working"
)]


# Derive percent_working: the share of graduates counted as working out of
# all counted graduates (working + not working), as a whole-number percentage.
# Rows with an NA in either count yield NA.
mid_atlantic_2$percent_working <- with(
  mid_atlantic_2,
  round(count_working / (count_not_working + count_working) * 100)
)

mid_atlantic_4$percent_working <- with(
  mid_atlantic_4,
  round(count_working / (count_not_working + count_working) * 100)
)

# Replace some values with string replacements so institution names are
# shorter:
#   "Community College" -> "CC", "College" -> "Col.", "University" -> "Uni."
#
# Every occurrence is replaced (gsub), not just the first one, so a name such
# as "University of Maryland-University College" is abbreviated consistently.
# Patterns are matched literally (fixed = TRUE), not as regular expressions.
#
# inst_name: character vector of institution names.
# Returns a character vector of the same length; NA entries stay NA.
abbreviate_inst_name <- function(inst_name) {
  # "Community College" must be handled before the bare "College" rule,
  # otherwise it would end up as "Community Col.".
  inst_name <- gsub("Community College", "CC", inst_name, fixed = TRUE)
  inst_name <- gsub("College", "Col.", inst_name, fixed = TRUE)
  gsub("University", "Uni.", inst_name, fixed = TRUE)
}

mid_atlantic_2$inst_name <- abbreviate_inst_name(mid_atlantic_2$inst_name)
mid_atlantic_4$inst_name <- abbreviate_inst_name(mid_atlantic_4$inst_name)


# 1. Data Exploration: This should include summary statistics, means, medians, quartiles, or any other relevant information about the data set. Please include some conclusions in the R Markdown text.

# Heading followed by the per-column summary (quartiles, mean, NA counts for
# numeric columns) of the two-year subset.
print(x = "Summary Statistics for Two Year College Graduates")

## [1] "Summary Statistics for Two Year College Graduates"

print(summary(object = mid_atlantic_2))

##   inst_name          state_abbr         earnings_med   count_not_working
##  Length:1473        Length:1473        Min.   :12600   Min.   :   0     
##  Class :character   Class :character   1st Qu.:28300   1st Qu.:  98     
##  Mode  :character   Mode  :character   Median :32300   Median : 215     
##                                        Mean   :33989   Mean   : 327     
##                                        3rd Qu.:37500   3rd Qu.: 433     
##                                        Max.   :92500   Max.   :4754     
##                                        NA's   :396     NA's   :428      
##  count_working   percent_working 
##  Min.   :   26   Min.   : 47.00  
##  1st Qu.:  399   1st Qu.: 80.00  
##  Median : 1025   Median : 83.00  
##  Mean   : 1523   Mean   : 82.82  
##  3rd Qu.: 2086   3rd Qu.: 86.00  
##  Max.   :17042   Max.   :100.00  
##  NA's   :388     NA's   :428

# Emit blank lines to separate the two summaries in the output.
cat("\n\n")

# Heading for the table below. That table summarises mid_atlantic_4, so the
# heading must say "Four Year".
print("Summary Statistics for Four Year College Graduates")

## [1] "Summary Statistics for Four Year College Graduates"

summary(mid_atlantic_4)

##   inst_name          state_abbr         earnings_med    count_not_working
##  Length:2579        Length:2579        Min.   : 11800   Min.   :    1.0  
##  Class :character   Class :character   1st Qu.: 39500   1st Qu.:   49.0  
##  Mode  :character   Mode  :character   Median : 45900   Median :  103.0  
##                                        Mean   : 46906   Mean   :  395.8  
##                                        3rd Qu.: 53000   3rd Qu.:  256.0  
##                                        Max.   :128000   Max.   :15960.0  
##                                        NA's   :834      NA's   :842      
##  count_working   percent_working
##  Min.   :    8   Min.   :22.00  
##  1st Qu.:  431   1st Qu.:86.00  
##  Median :  882   Median :90.00  
##  Mean   : 2831   Mean   :87.65  
##  3rd Qu.: 2101   3rd Qu.:92.00  
##  Max.   :94724   Max.   :98.00  
##  NA's   :799     NA's   :843

# 3. Graphics: Please make sure to display at least one scatter plot, box plot and histogram. 
# Don’t be limited to this. Please explore the many other options in R packages such as ggplot2.


# histograms
# Draw a 100-bin histogram of earnings_med. Both charts share the same fixed
# x and y limits so they can be compared directly; the title is centred
# horizontally (hjust = 0.5).
#
# graduates:   data frame with an earnings_med column.
# chart_title: plot title.
plot_earnings_histogram <- function(graduates, chart_title) {
  ggplot(data = graduates) +
    geom_histogram(aes(x = earnings_med), bins = 100, na.rm = TRUE) +
    scale_x_continuous(limits = c(10000, 130000), labels = scales::comma) +
    scale_y_continuous(limits = c(0, 120)) +
    labs(title = chart_title, x = "Earnings in Dollars", y = "Count") +
    theme(plot.title = element_text(hjust = 0.5))
}

plot_earnings_histogram(
  mid_atlantic_2,
  "Earnings Histogram for Two Year Degrees"
)

plot_earnings_histogram(
  mid_atlantic_4,
  "Earnings Histogram for Four Year Degrees"
)

# scatter plots
# Plot the number of working graduates (log10 y axis) against earnings_med,
# one point per row, coloured by state. Both charts use the same x limits.
#
# graduates:   data frame with earnings_med, count_working and state_abbr.
# chart_title: plot title.
plot_working_count_scatter <- function(graduates, chart_title) {
  ggplot(graduates, aes(x = earnings_med, y = count_working)) +
    geom_point(aes(color = state_abbr), na.rm = TRUE) +
    scale_x_continuous(limits = c(10000, 130000), labels = scales::comma) +
    scale_y_log10() +
    labs(
      title = chart_title,
      x = "Earnings in Dollars",
      y = "Working Graduates"
    ) +
    theme(plot.title = element_text(hjust = 0.5))
}

plot_working_count_scatter(mid_atlantic_2, "Earnings for Two Year Degrees")

plot_working_count_scatter(mid_atlantic_4, "Earnings for Four Year Degrees")

# scatter plots #2
# Plot percent_working against earnings_med, one point per row, coloured by
# state. Both charts share the same axes so they can be compared directly.
#
# The y axis spans the full 0-100 percent range. A lower limit of 40 would
# discard rows below it, and the four-year summary printed earlier in this
# file reports a minimum percent_working of 22; because na.rm = TRUE
# suppresses the removal warning, those points would vanish silently.
#
# graduates:   data frame with earnings_med, percent_working and state_abbr.
# chart_title: plot title.
plot_percent_working_scatter <- function(graduates, chart_title) {
  ggplot(graduates, aes(x = earnings_med, y = percent_working)) +
    geom_point(aes(color = state_abbr), na.rm = TRUE) +
    scale_x_continuous(limits = c(10000, 130000), labels = scales::comma) +
    scale_y_continuous(limits = c(0, 100)) +
    labs(
      title = chart_title,
      x = "Earnings in Dollars",
      y = "Percent of Working Graduates"
    ) +
    theme(plot.title = element_text(hjust = 0.5))
}

plot_percent_working_scatter(mid_atlantic_2, "Earnings for Two Year Degrees")

plot_percent_working_scatter(mid_atlantic_4, "Earnings for Four Year Degrees")

# box plots
# Box plot of earnings_med per state, on a shared y axis for both charts.
#
# The upper limit is 130000 (the same bound the histograms and scatter plots
# use for earnings). Limits set on a continuous scale discard out-of-range
# observations before the box statistics are computed, and the four-year
# summary printed earlier in this file reports a maximum earnings_med of
# 128000, which a 125000 limit would silently exclude.
#
# graduates:   data frame with earnings_med and state_abbr.
# chart_title: plot title.
plot_earnings_boxplot <- function(graduates, chart_title) {
  ggplot(graduates, aes(y = earnings_med, x = state_abbr)) +
    geom_boxplot(na.rm = TRUE) +
    labs(title = chart_title, x = "State", y = "Earnings") +
    scale_y_continuous(limits = c(0, 130000)) +
    theme(plot.title = element_text(hjust = 0.5))
}

plot_earnings_boxplot(mid_atlantic_2, "Median Earnings for Two Year Graduates")

plot_earnings_boxplot(mid_atlantic_4, "Median Earnings for Four Year Graduates")

# violin
# Violin plot of earnings_med per state, on a shared y axis for both charts.
#
# The upper limit is 130000 (the same bound the histograms and scatter plots
# use for earnings). Limits set on a continuous scale discard out-of-range
# observations before the density is estimated, and the four-year summary
# printed earlier in this file reports a maximum earnings_med of 128000,
# which a 125000 limit would silently exclude.
#
# graduates:   data frame with earnings_med and state_abbr.
# chart_title: plot title.
plot_earnings_violin <- function(graduates, chart_title) {
  ggplot(graduates, aes(y = earnings_med, x = state_abbr)) +
    geom_violin(na.rm = TRUE) +
    labs(title = chart_title, x = "State", y = "Earnings") +
    scale_y_continuous(limits = c(0, 130000)) +
    theme(plot.title = element_text(hjust = 0.5))
}

plot_earnings_violin(mid_atlantic_2, "Median Earnings for Two Year Graduates")

plot_earnings_violin(mid_atlantic_4, "Median Earnings for Four Year Graduates")

R - Final Project

Cliff Lee

8/2/2021

Do Four-Year Graduates From Mid-Atlantic Colleges Earn at Least 10% More Than Two-Year Graduates?

RPub link : https://rpubs.com/cliftonleesps/r_week3_final

Conclusion

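The answer to the initial question is yes: four-year degree graduates earn well over 10% more than two-year degree graduates. Comparing the median values of earnings_med in the summary statistics ($45,900 for four-year versus $32,300 for two-year), they typically earn about 42% more, so if a student has a choice, a four-year degree will pay for itself quickly in higher earnings.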