Title: “Exam 1” Author: “Joe Shepardson” Date: “3/25/2020” Output: The Best Exam 1

##1.1: Introduction that explains the purpose of the document.

#The purpose of this document is to take a deeper look into the college scorecard dataset and find #patterns and trends amongst the data for making insights.

#1.2: Provide a short explanation of the data used

#The data showed has collected information of various types over 7100+ universities #across the United States.

#1.3: Explain how your analysis will help an individual better understand #how institutions of post-secondary education in Ohio #compare to other schools regionally and nationally.

#My analysis will include condensed datasets, tables and visualizations that will be easily #readble and transferable to the average individual for making their own inferences about #schools regionally and nationally.

#2.1 All packages used are identified and loaded upfront so the reader #knows which are required to replicate the analysis. #Code for installing packages is included but should be commented out so it is not run by default.

Required Packages:

library("tidyverse")
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library("stringr")
library("skimr")
library("dplyr")
library("DT")

#2.2 Messages and warnings resulting from loading the package(s) are suppressed.

#2.3 Explanation is provided regarding the purpose of each package (there are over 10,000 packages, don’t assume that I know why you loaded each package).

#Tidyverse - “tidyverse” collects some of the most versatile R packages: ggplot2, dplyr, tidyr, readr, purrr, and tibble.

#Dplyr - With dplyr , anything you can do to a local data frame you can also do to a remote database table.

#The stringr package provide a cohesive #set of functions designed to make working with strings as easy as possible.

#skimr is an alternative to summary() , quickly providing a broad overview of a data frame.

#DT helps run scavascript tables.

#3.1 Import the data set from: http://asayanalytics.com/scorecard

scorecard <- read_csv("https://myxavier-my.sharepoint.com/:x:/g/personal/shepardsonj_xavier_edu/EQvUaUP2_n5CnKLSaFOKBSkBz61cw97ZDEiyuXi_xMvwTg?download=1")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   INSTNM = col_character(),
##   CITY = col_character(),
##   STABBR = col_character(),
##   ZIP = col_character(),
##   CONTROL = col_character(),
##   HBCU = col_character(),
##   UGDS = col_character()
## )
## See spec(...) for full column specifications.

#3.2 Data Cleaning

#Number of missing values
sum(is.na(scorecard))
## [1] 49030
#Identifying missing values per column
colSums(is.na(scorecard))
##        ID    INSTNM      CITY    STABBR       ZIP   CONTROL    LOCALE  LATITUDE 
##         0         0         0         0         0         0       444       445 
## LONGITUDE      HBCU   MENONLY WOMENONLY  ADM_RATE   ACTCM25   ACTCM75  ACTCMMID 
##       445         0       444       444      5078      5823      5823      5823 
##   SAT_AVG      UGDS  COSTT4_A AVGFACSAL   PCTPELL  PCTFLOAN AGE_ENTRY    FEMALE 
##      5795         0      3531      2868       770       770       500      1429 
##   MARRIED DEPENDENT   VETERAN FIRST_GEN    FAMINC 
##      1392       921      4538      1247       500
#Reassigned NULL values to NA
scorecard[scorecard=='NULL'] <- NA

#Reassigned string values to appropriate integers to match dataset.
scorecard[scorecard=='Public'] <- 1
scorecard[scorecard=='Private nonprofit'] <- 2 
scorecard[scorecard=='For profit'] <- 3  
scorecard[scorecard=='Private for-profit'] <- 3

#Changing incvalid Locale resonse to NA.
scorecard[scorecard=='-3'] <- NA

#Dummy Variables:

OHIO_AVGFAMINC <- 54021

#Dummy Columnn for Family Incomes above or below Ohio median of $54021.
scorecard$INC <- ifelse(scorecard$FAMINC>OHIO_AVGFAMINC,1, 0)


#Dummy Column for determining College or University.
scorecard$university <- ifelse(str_detect(scorecard$INSTNM, "College"),"College", "University/Other")

#Dummy Column for colleges bordering Ohio.
scorecard$neighbors <- ifelse(scorecard$STABBR %in% c("KY", "WV", "IN", "MI","PA"), "neighbor", "stranger")

#Mean of ACT Scores

mean(scorecard$ACTCMMID, na.rm = TRUE)               
## [1] 23.44195
#Dummy Variable showing whether institution ACT scores were above or below average
scorecard$GoodACT <- ifelse(scorecard$ACTCMMID>mean(scorecard$ACTCMMID, na.rm = TRUE), 1, 0)

#Mean SAT scores
mean(scorecard$SAT_AVG, na.rm=TRUE)
## [1] 1131.774
#Dummy Variable showing whether institution SAT scores were above or below average.
scorecard$GoodSat <- ifelse(scorecard$SAT_AVG>mean(scorecard$SAT_AVG, na.rm=TRUE), 1, 0)


#Mean cost of attendance.
mean(scorecard$COSTT4_A, na.rm= TRUE)
## [1] 26337.07
#Dummy Variable showing whether college or university had above average cost of attendance.
scorecard$ATT <- ifelse(scorecard$COSTT4_A>mean(scorecard$COSTT4_A, na.rm= TRUE), 1, 0)

#3.3 Markdown Table

Column Variables Description
ID Unique ID for institution
INSTNM institution name
CITY City of insitution
STABBR State postcode
ZIP ZIPCODE
CONTROL Control of insitution 1=Public 2=Private nonprofit 3=Private for-profit
LOCALE 11-13=Cities 21-23=Suburbs 31-33=Towns 41-42=Rural
HBCU Historially Black College and University
ADM_RATE Admission rate
ACTCM25 25th percentile of ACT cumulative score
ACTCM75 75th percentile of ACT cumulative score
ACTCMMID Midpoint of ACT cumulative score
SAT_AVG Average SAT equivalent score of students
UGDS Enrollment of undergraduates
COSTT4_A Average cost of attendance (academic year institutions)
AVGFACSAL Average faculty salary
PCTPELL Percentage of undergraduates who receive a Pell Grant
PCTFLOAN Percent of all undergraduate students receiving a federal student loan
AGE_ENTRY Average age of entry
FEMALE Share of female students
MARRIED Share of married students
DEPENDENT Share of dependent students
VETERAN Share of veteran students
FIRST_GEN Share of first-generation students
FAMINC Average family income in real 2015 dollars

#3.4: Clean Datatable

clean_scorecard<- data.frame(scorecard)

datatable(clean_scorecard)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html