# 5. BONUS – place the original .csv in a github file and have R read from the link. This will be a very useful skill as you progress in your data science education and career.
# Load the scorecard data straight from the raw GitHub URL.
# read.csv() is used rather than read.table(): its defaults are
# quote = "\"" and comment.char = "", so an apostrophe or a "#" inside a
# field (e.g. an institution name) is read literally instead of being
# treated as a quote character or the start of a comment.
scorecard_url <- "https://raw.githubusercontent.com/cliftonleesps/dataset/main/scorecard.csv"
data_frame <- read.csv(file = scorecard_url, header = TRUE)
# Remove a few unnecessary columns: keep only the ones used further down
data_frame <- subset(
  data_frame,
  select = c(
    inst_name, state_abbr, pred_degree_awarded_ipeds,
    earnings_med, count_not_working, count_working
  )
)
# 2. Data wrangling: Please perform some basic transformations. They will need to make sense but could include column renaming, creating a subset of the data, replacing values, or creating new columns with derived data (for example – if it makes sense you could sum two columns together)
# create two subsets for two and four year colleges in the mid Atlantic states
# State codes are matched exactly with %in% instead of an unanchored regex,
# so only these abbreviations qualify (and NA states are simply excluded).
# NOTE(review): Pennsylvania (PA) is usually counted as Mid-Atlantic but is
# absent from this list — confirm whether that is intentional.
mid_atlantic_states <- c("NY", "NJ", "DE", "MD", "DC", "VA", "WV")

# Rows of `scorecard` located in a Mid-Atlantic state whose
# pred_degree_awarded_ipeds equals `degree_code`. The
# pred_degree_awarded_ipeds column itself is dropped from the result because
# it is constant within each subset.
subset_mid_atlantic <- function(scorecard, degree_code) {
  subset(
    scorecard,
    state_abbr %in% mid_atlantic_states &
      pred_degree_awarded_ipeds == degree_code,
    select = c(
      inst_name, state_abbr, earnings_med,
      count_not_working, count_working
    )
  )
}

# Code 2 is used for the two year subset and code 3 for the four year subset
mid_atlantic_2 <- subset_mid_atlantic(data_frame, 2)
mid_atlantic_4 <- subset_mid_atlantic(data_frame, 3)
# Derive percent_working for each subset: the share of graduates who are
# working out of all graduates counted (working + not working), expressed as
# a whole-number percentage. Rows with a missing count yield NA.
mid_atlantic_2$percent_working <- with(
  mid_atlantic_2,
  round(count_working / (count_not_working + count_working) * 100)
)
mid_atlantic_4$percent_working <- with(
  mid_atlantic_4,
  round(count_working / (count_not_working + count_working) * 100)
)
# Replace some values with string replacements
# Abbreviations applied to institution names. str_replace_all() applies the
# entries of a named vector in order, so "Community College" is shortened to
# "CC" before the bare "College" rule can touch it. Using the *_all variant
# abbreviates every occurrence; str_replace() would rewrite only the first
# match and leave names such as "... University ... University College"
# half abbreviated.
inst_name_abbreviations <- c(
  "Community College" = "CC",
  "College" = "Col.",
  "University" = "Uni."
)
mid_atlantic_2$inst_name <- str_replace_all(
  string = mid_atlantic_2$inst_name,
  pattern = inst_name_abbreviations
)
mid_atlantic_4$inst_name <- str_replace_all(
  string = mid_atlantic_4$inst_name,
  pattern = inst_name_abbreviations
)
# 1. Data Exploration: This should include summary statistics, means, medians, quartiles, or any other relevant information about the data set. Please include some conclusions in the R Markdown text.
# Print a heading followed by summary() for each subset. The lines starting
# with "##" below are the console output recorded from a previous run.
print("Summary Statistics for Two Year College Graduates")
## [1] "Summary Statistics for Two Year College Graduates"
summary(mid_atlantic_2)
## inst_name state_abbr earnings_med count_not_working
## Length:1473 Length:1473 Min. :12600 Min. : 0
## Class :character Class :character 1st Qu.:28300 1st Qu.: 98
## Mode :character Mode :character Median :32300 Median : 215
## Mean :33989 Mean : 327
## 3rd Qu.:37500 3rd Qu.: 433
## Max. :92500 Max. :4754
## NA's :396 NA's :428
## count_working percent_working
## Min. : 26 Min. : 47.00
## 1st Qu.: 399 1st Qu.: 80.00
## Median : 1025 Median : 83.00
## Mean : 1523 Mean : 82.82
## 3rd Qu.: 2086 3rd Qu.: 86.00
## Max. :17042 Max. :100.00
## NA's :388 NA's :428
cat("\n\n")
# This heading introduces the four year subset (it previously repeated the
# "Two Year" label by mistake).
print("Summary Statistics for Four Year College Graduates")
## [1] "Summary Statistics for Four Year College Graduates"
summary(mid_atlantic_4)
## inst_name state_abbr earnings_med count_not_working
## Length:2579 Length:2579 Min. : 11800 Min. : 1.0
## Class :character Class :character 1st Qu.: 39500 1st Qu.: 49.0
## Mode :character Mode :character Median : 45900 Median : 103.0
## Mean : 46906 Mean : 395.8
## 3rd Qu.: 53000 3rd Qu.: 256.0
## Max. :128000 Max. :15960.0
## NA's :834 NA's :842
## count_working percent_working
## Min. : 8 Min. :22.00
## 1st Qu.: 431 1st Qu.:86.00
## Median : 882 Median :90.00
## Mean : 2831 Mean :87.65
## 3rd Qu.: 2101 3rd Qu.:92.00
## Max. :94724 Max. :98.00
## NA's :799 NA's :843
# 3. Graphics: Please make sure to display at least one scatter plot, box plot and histogram.
# Don’t be limited to this. Please explore the many other options in R packages such as ggplot2.
# histograms
# Histogram of earnings_med for one subset.
#   data:       data frame with an earnings_med column
#   plot_title: title shown centred above the plot
# The x scale keeps its explicit limits so both histograms are binned over
# the same 10,000-130,000 range. The y range is set with coord_cartesian()
# rather than scale limits: scale limits silently discard any bar taller than
# the limit (na.rm = TRUE hides the warning), whereas coord_cartesian() only
# zooms the view.
plot_earnings_histogram <- function(data, plot_title) {
  ggplot(data = data) +
    geom_histogram(aes(x = earnings_med), bins = 100, na.rm = TRUE) +
    scale_x_continuous(limits = c(10000, 130000), labels = scales::comma) +
    coord_cartesian(ylim = c(0, 120)) +
    labs(title = plot_title, x = "Earnings in Dollars", y = "Count") +
    theme(plot.title = element_text(hjust = 0.5))
}

plot_earnings_histogram(
  mid_atlantic_2, "Earnings Histogram for Two Year Degrees"
)

plot_earnings_histogram(
  mid_atlantic_4, "Earnings Histogram for Four Year Degrees"
)

# scatter plots
# Median earnings (x) against the number of working graduates (y, log10
# scale), one point per row coloured by state. The components shared by the
# two year and four year plots are collected once and added to each plot.
working_scatter_layers <- list(
  geom_point(aes(color = state_abbr), na.rm = TRUE),
  scale_x_continuous(limits = c(10000, 130000), labels = scales::comma),
  scale_y_log10(),
  theme(plot.title = element_text(hjust = 0.5))
)

ggplot(mid_atlantic_2, aes(x = earnings_med, y = count_working)) +
  working_scatter_layers +
  labs(
    title = "Earnings for Two Year Degrees",
    x = "Earnings in Dollars",
    y = "Working Graduates"
  )

ggplot(mid_atlantic_4, aes(x = earnings_med, y = count_working)) +
  working_scatter_layers +
  labs(
    title = "Earnings for Four Year Degrees",
    x = "Earnings in Dollars",
    y = "Working Graduates"
  )

# scatter plots #2
# Median earnings (x) against the percentage of working graduates (y), one
# point per row coloured by state.
#   data:       data frame with earnings_med, percent_working and state_abbr
#   plot_title: title shown centred above the plot
# The y scale is limited to 40-100, so rows with a lower percent_working are
# not drawn.
plot_percent_working <- function(data, plot_title) {
  earnings_axis <- scale_x_continuous(
    limits = c(10000, 130000),
    labels = scales::comma
  )
  percent_axis <- scale_y_continuous(limits = c(40, 100))
  centred_title <- theme(plot.title = element_text(hjust = 0.5))

  ggplot(data, aes(x = earnings_med, y = percent_working)) +
    geom_point(aes(color = state_abbr), na.rm = TRUE) +
    earnings_axis +
    percent_axis +
    labs(
      title = plot_title,
      x = "Earnings in Dollars",
      y = "Percent of Working Graduates"
    ) +
    centred_title
}

plot_percent_working(mid_atlantic_2, "Earnings for Two Year Degrees")

plot_percent_working(mid_atlantic_4, "Earnings for Four Year Degrees")

# box plots
# Box plot of earnings_med per state for one subset.
#   data:       data frame with earnings_med and state_abbr columns
#   plot_title: title shown centred above the plot
# The y range is set with coord_cartesian() instead of scale limits. Scale
# limits drop observations outside 0-125,000 before the box statistics are
# computed, and the four year data reaches 128,000, so quartiles and outliers
# were being calculated on truncated data. coord_cartesian() only zooms.
plot_earnings_boxplot <- function(data, plot_title) {
  ggplot(data, aes(y = earnings_med, x = state_abbr)) +
    geom_boxplot(na.rm = TRUE) +
    labs(title = plot_title, x = "State", y = "Earnings") +
    coord_cartesian(ylim = c(0, 125000)) +
    theme(plot.title = element_text(hjust = 0.5))
}

plot_earnings_boxplot(mid_atlantic_2, "Median Earnings for Two Year Graduates")

plot_earnings_boxplot(mid_atlantic_4, "Median Earnings for Four Year Graduates")

# violin
# Violin plot of earnings_med per state for one subset.
#   data:       data frame with earnings_med and state_abbr columns
#   plot_title: title shown centred above the plot
# The y range is set with coord_cartesian() instead of scale limits. Scale
# limits remove observations outside 0-125,000 before the density is
# estimated, and the four year data reaches 128,000, so the violins were
# being drawn from truncated data. coord_cartesian() only zooms the view.
plot_earnings_violin <- function(data, plot_title) {
  ggplot(data, aes(y = earnings_med, x = state_abbr)) +
    geom_violin(na.rm = TRUE) +
    labs(title = plot_title, x = "State", y = "Earnings") +
    coord_cartesian(ylim = c(0, 125000)) +
    theme(plot.title = element_text(hjust = 0.5))
}

plot_earnings_violin(mid_atlantic_2, "Median Earnings for Two Year Graduates")

plot_earnings_violin(mid_atlantic_4, "Median Earnings for Four Year Graduates")
