1. Introduction

We used the data from https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv

I propose the following 10 questions based on my own understanding of the data:

2. Analysis

We will explore the questions in detail.

# Read the CollegeScores4yr CSV directly from the Lock5 site into `college`,
# then show the first six rows as a quick sanity check of the columns.
college <- read.csv(
  file = "https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv"
)
head(college, n = 6L)
##                                  Name State     ID Main
## 1            Alabama A & M University    AL 100654    1
## 2 University of Alabama at Birmingham    AL 100663    1
## 3                  Amridge University    AL 100690    1
## 4 University of Alabama in Huntsville    AL 100706    1
## 5            Alabama State University    AL 100724    1
## 6           The University of Alabama    AL 100751    1
##                                                                Accred
## 1 Southern Association of Colleges and Schools Commission on Colleges
## 2 Southern Association of Colleges and Schools Commission on Colleges
## 3 Southern Association of Colleges and Schools Commission on Colleges
## 4 Southern Association of Colleges and Schools Commission on Colleges
## 5 Southern Association of Colleges and Schools Commission on Colleges
## 6 Southern Association of Colleges and Schools Commission on Colleges
##   MainDegree HighDegree Control    Region Locale Latitude Longitude AdmitRate
## 1          3          4  Public Southeast   City 34.78337 -86.56850    0.9027
## 2          3          4  Public Southeast   City 33.50570 -86.79935    0.9181
## 3          3          4 Private Southeast   City 32.36261 -86.17401        NA
## 4          3          4  Public Southeast   City 34.72456 -86.64045    0.8123
## 5          3          4  Public Southeast   City 32.36432 -86.29568    0.9787
## 6          3          4  Public Southeast   City 33.21187 -87.54598    0.5330
##   MidACT AvgSAT Online Enrollment White Black Hispanic Asian Other PartTime
## 1     18    929      0       4824   2.5  90.7      0.9   0.2   5.6      6.6
## 2     25   1195      0      12866  57.8  25.9      3.3   5.9   7.1     25.2
## 3     NA     NA      1        322   7.1  14.3      0.6   0.3  77.6     54.4
## 4     28   1322      0       6917  74.2  10.7      4.6   4.0   6.5     15.0
## 5     18    935      0       4189   1.5  93.8      1.0   0.3   3.5      7.7
## 6     28   1278      0      32387  78.5  10.1      4.7   1.2   5.6      7.9
##   NetPrice  Cost TuitionIn TuitonOut TuitionFTE InstructFTE FacSalary
## 1    15184 22886      9857     18236       9227        7298      6983
## 2    17535 24129      8328     19032      11612       17235     10640
## 3     9649 15080      6900      6900      14738        5265      3866
## 4    19986 22108     10280     21480       8727        9748      9391
## 5    12874 19413     11068     19396       9003        7983      7399
## 6    21973 28836     10780     28100      13574       10894     10016
##   FullTimeFac Pell CompRate Debt Female FirstGen MedIncome
## 1        71.3 71.0    23.96 1068   56.4     36.6      23.6
## 2        89.9 35.3    52.92 3755   63.9     34.1      34.5
## 3       100.0 74.2    18.18  109   64.9     51.3      15.0
## 4        64.6 27.7    48.62 1347   47.6     31.0      44.8
## 5        54.2 73.8    27.69 1294   61.3     34.3      22.1
## 6        74.0 18.0    67.87 6430   61.5     22.6      66.7

Q1: Which state has the lowest average admission rate?

# Mean admission rate per state. The formula interface of aggregate() drops
# rows whose AdmitRate is NA before averaging.
states <- aggregate(AdmitRate ~ State, data = college, FUN = mean, na.rm = TRUE)

# Single state with the smallest mean admission rate.
lowest_state <- states[which.min(states$AdmitRate), ]
print(lowest_state)
##   State AdmitRate
## 8    DC 0.5162143

# Sort by the mean before taking ten rows, so the chart really shows the ten
# lowest states rather than the first ten rows of the State-ordered aggregate.
lowest_states <- head(states[order(states$AdmitRate), ], 10)
barplot(lowest_states$AdmitRate, names.arg = lowest_states$State, las = 2,
        col = "lightblue",
        main = "States with Lowest Admission Rates", ylab = "Admission Rate")

Q2: Which state has the highest average admission rate?

# Mean admission rate per state. The formula interface of aggregate() drops
# rows whose AdmitRate is NA before averaging.
states <- aggregate(AdmitRate ~ State, data = college, FUN = mean, na.rm = TRUE)

# Single state with the largest mean admission rate.
highest_state <- states[which.max(states$AdmitRate), ]
print(highest_state)
##    State AdmitRate
## 12    GU         1

# Sort descending before taking ten rows, so the chart really shows the ten
# highest states instead of an arbitrary slice of the State-ordered aggregate.
highest_states <- head(states[order(states$AdmitRate, decreasing = TRUE), ], 10)
barplot(highest_states$AdmitRate, names.arg = highest_states$State, las = 2,
        col = "lightgreen",
        main = "States with Highest Admission Rates", ylab = "Admission Rate")

Q3: Which institution type pays faculty the most: public, profit, or private?

# Mean FacSalary for each Control category (Private, Profit, Public).
faculty_salary_by_type <- aggregate(
  FacSalary ~ Control,
  data = college,
  FUN = mean,
  na.rm = TRUE
)

# Keep only the row of the category with the largest mean.
highest_salary_type <- faculty_salary_by_type[
  which.max(faculty_salary_by_type[["FacSalary"]]),
]

print(faculty_salary_by_type)
##   Control FacSalary
## 1 Private  7090.578
## 2  Profit  6233.672
## 3  Public  8520.173
print(highest_salary_type)
##   Control FacSalary
## 3  Public  8520.173

# Full distribution per category, to put the means in context.
boxplot(
  FacSalary ~ Control,
  data = college,
  col = c("lightblue", "lightgreen", "pink"),
  main = "Faculty Salaries by Institution Type",
  xlab = "Institution Type",
  ylab = "Faculty Salary"
)

Q4: What is the average overall student debt?

# Overall mean of the Debt column, ignoring missing values.
# NOTE(review): the units of Debt are not visible here; confirm against the
# dataset's data dictionary before interpreting the number.
average_student_debt <- mean(college[["Debt"]], na.rm = TRUE)
print(average_student_debt)
## [1] 2365.655

# Histogram of Debt across all colleges (about 20 bins requested).
hist(
  college[["Debt"]],
  breaks = 20,
  col = "lightblue",
  main = "Distribution of Student Debt",
  xlab = "Average Student Debt"
)

Q5: Do private schools have higher InstructFTE than public schools?

# Mean InstructFTE for each Control category, to compare private and public
# schools (the Profit category is reported as well).
instruct_fte_by_type <- aggregate(
  InstructFTE ~ Control,
  data = college,
  FUN = mean,
  na.rm = TRUE
)
print(instruct_fte_by_type)
##   Control InstructFTE
## 1 Private   11340.878
## 2  Profit    4891.524
## 3  Public   11573.742

# Side-by-side distributions of InstructFTE per category.
boxplot(
  InstructFTE ~ Control,
  data = college,
  col = c("lightblue", "lightgreen", "pink"),
  main = "Instructional Spending per FTE by Institution Type",
  xlab = "Institution Type",
  ylab = "Instructional Spending"
)

Q6: What is the average out-of-state tuition cost?

# Mean out-of-state tuition over all colleges, ignoring missing values.
# The column really is spelled `TuitonOut` in the dataset.
avg_out_state_tuition <- mean(college[["TuitonOut"]], na.rm = TRUE)
print(avg_out_state_tuition)
## [1] 25336.66

# Out-of-state tuition distribution broken down by Region.
boxplot(
  TuitonOut ~ Region,
  data = college,
  col = "lightgreen",
  main = "Out-of-State Tuition Across Regions",
  xlab = "Region",
  ylab = "Tuition Cost"
)

Q7: Which states have the highest percentage of first-generation students?

# Mean FirstGen per state.
first_gen_by_state <- aggregate(
  FirstGen ~ State,
  data = college,
  FUN = mean,
  na.rm = TRUE
)

# Reorder the states from the highest mean to the lowest.
top_states_first_gen <- first_gen_by_state[
  order(-first_gen_by_state[["FirstGen"]]),
]
print(head(top_states_first_gen, n = 5))
##    State FirstGen
## 49    VI 46.60000
## 34    NM 43.89091
## 35    NV 43.39091
## 12    GU 42.55000
## 53    WV 41.85263

# Bar chart of the ten highest-ranked states.
barplot(
  top_states_first_gen[["FirstGen"]][seq_len(10)],
  names.arg = top_states_first_gen[["State"]][seq_len(10)],
  las = 2,
  col = "lightblue",
  main = "Top 10 States with Highest % of First-Generation Students",
  ylab = "Percentage"
)

Q8: Which states have the lowest percentage of first-generation students?

# Mean FirstGen per state.
first_gen_by_state <- aggregate(
  FirstGen ~ State,
  data = college,
  FUN = mean,
  na.rm = TRUE
)

# Reorder the states from the lowest mean to the highest.
bottom_states_first_gen <- first_gen_by_state[
  order(first_gen_by_state[["FirstGen"]]),
]
print(head(bottom_states_first_gen, n = 5))
##    State FirstGen
## 42    RI 23.68000
## 25    MN 23.80000
## 30    ND 26.22222
## 50    VT 26.32727
## 14    IA 27.04828

# Bar chart of the ten lowest-ranked states.
barplot(
  bottom_states_first_gen[["FirstGen"]][seq_len(10)],
  names.arg = bottom_states_first_gen[["State"]][seq_len(10)],
  las = 2,
  col = "lightgreen",
  main = "Top 10 States with Lowest % of First-Generation Students",
  ylab = "Percentage"
)

Q9: What is the mean faculty wage in the Midwest?

# Keep only the colleges whose Region is "Midwest"; which() discards rows
# where the comparison is NA.
midwest_schools <- college[which(college[["Region"]] == "Midwest"), ]

# Mean FacSalary within that subset, ignoring missing values.
mean_midwest_faculty_salary <- mean(midwest_schools[["FacSalary"]], na.rm = TRUE)
print(mean_midwest_faculty_salary)
## [1] 7113.745

# Compare the FacSalary distribution across every region.
boxplot(
  FacSalary ~ Region,
  data = college,
  col = c("lightblue", "lightgreen", "pink", "orange", "purple"),
  main = "Faculty Salaries by Region",
  xlab = "Region",
  ylab = "Faculty Salary"
)

Q10: What is the correlation between being an online-only college and completion rate?

# Correlation between the 0/1 Online indicator and CompRate, computed only on
# rows where both values are present.
correlation <- with(college, cor(x = Online, y = CompRate, use = "complete.obs"))
print(paste(
  "Correlation between online-only colleges and completion rates:",
  round(correlation, digits = 2)
))
## [1] "Correlation between online-only colleges and completion rates: -0.09"

# Completion-rate distribution for each value of the Online indicator.
boxplot(
  CompRate ~ Online,
  data = college,
  names = c("Non-Online", "Online"),
  col = c("lightblue", "lightgreen"),
  notch = FALSE,
  main = "Completion Rates: Online-Only vs Non-Online Colleges",
  xlab = "Online-Only College (0 = No, 1 = Yes)",
  ylab = "Completion Rate (%)"
)