1. Introduction

We used the data from https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv

I propose the following 10 questions based on my own understanding of the data:

Analysis

We will explore the questions in detail.

# Load the Lock5 CollegeScores4yr dataset directly from its published URL.
college <- read.csv(
  file = "https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv"
)
# Preview the first rows to check that the columns were read as expected.
head(college)
##                                  Name State     ID Main
## 1            Alabama A & M University    AL 100654    1
## 2 University of Alabama at Birmingham    AL 100663    1
## 3                  Amridge University    AL 100690    1
## 4 University of Alabama in Huntsville    AL 100706    1
## 5            Alabama State University    AL 100724    1
## 6           The University of Alabama    AL 100751    1
##                                                                Accred
## 1 Southern Association of Colleges and Schools Commission on Colleges
## 2 Southern Association of Colleges and Schools Commission on Colleges
## 3 Southern Association of Colleges and Schools Commission on Colleges
## 4 Southern Association of Colleges and Schools Commission on Colleges
## 5 Southern Association of Colleges and Schools Commission on Colleges
## 6 Southern Association of Colleges and Schools Commission on Colleges
##   MainDegree HighDegree Control    Region Locale Latitude Longitude AdmitRate
## 1          3          4  Public Southeast   City 34.78337 -86.56850    0.9027
## 2          3          4  Public Southeast   City 33.50570 -86.79935    0.9181
## 3          3          4 Private Southeast   City 32.36261 -86.17401        NA
## 4          3          4  Public Southeast   City 34.72456 -86.64045    0.8123
## 5          3          4  Public Southeast   City 32.36432 -86.29568    0.9787
## 6          3          4  Public Southeast   City 33.21187 -87.54598    0.5330
##   MidACT AvgSAT Online Enrollment White Black Hispanic Asian Other PartTime
## 1     18    929      0       4824   2.5  90.7      0.9   0.2   5.6      6.6
## 2     25   1195      0      12866  57.8  25.9      3.3   5.9   7.1     25.2
## 3     NA     NA      1        322   7.1  14.3      0.6   0.3  77.6     54.4
## 4     28   1322      0       6917  74.2  10.7      4.6   4.0   6.5     15.0
## 5     18    935      0       4189   1.5  93.8      1.0   0.3   3.5      7.7
## 6     28   1278      0      32387  78.5  10.1      4.7   1.2   5.6      7.9
##   NetPrice  Cost TuitionIn TuitonOut TuitionFTE InstructFTE FacSalary
## 1    15184 22886      9857     18236       9227        7298      6983
## 2    17535 24129      8328     19032      11612       17235     10640
## 3     9649 15080      6900      6900      14738        5265      3866
## 4    19986 22108     10280     21480       8727        9748      9391
## 5    12874 19413     11068     19396       9003        7983      7399
## 6    21973 28836     10780     28100      13574       10894     10016
##   FullTimeFac Pell CompRate Debt Female FirstGen MedIncome
## 1        71.3 71.0    23.96 1068   56.4     36.6      23.6
## 2        89.9 35.3    52.92 3755   63.9     34.1      34.5
## 3       100.0 74.2    18.18  109   64.9     51.3      15.0
## 4        64.6 27.7    48.62 1347   47.6     31.0      44.8
## 5        54.2 73.8    27.69 1294   61.3     34.3      22.1
## 6        74.0 18.0    67.87 6430   61.5     22.6      66.7

Q1: Which state has the lowest average admission rate?

# Mean admission rate for each state (one row per state).
states <- aggregate(AdmitRate ~ State, data = college, FUN = mean, na.rm = TRUE)
# Row holding the single smallest state-level mean.
lowest_state <- states[which.min(states$AdmitRate), ]
print(lowest_state)
##   State AdmitRate
## 8    DC 0.5162143
# Rank the states from lowest to highest mean admission rate and keep the ten
# lowest. Plotting rows 1:10 of `states` directly would only show the first
# ten states in alphabetical order, not the ten with the lowest rates.
lowest_ten_states <- head(states[order(states$AdmitRate), ], 10)
barplot(
  lowest_ten_states$AdmitRate,
  names.arg = lowest_ten_states$State,
  las = 2,
  col = "lightblue",
  main = "States with Lowest Admission Rates",
  ylab = "Admission Rate"
)

Q2: Which state has the highest average admission rate?

# Mean admission rate for each state (one row per state).
states <- aggregate(AdmitRate ~ State, data = college, FUN = mean, na.rm = TRUE)
# Row holding the single largest state-level mean.
highest_state <- states[which.max(states$AdmitRate), ]
print(highest_state)
##    State AdmitRate
## 12    GU         1
# Rank the states from highest to lowest mean admission rate and keep the ten
# highest. Plotting rows 41:50 of `states` directly would only show an
# arbitrary alphabetical slice, not the ten with the highest rates.
highest_ten_states <- head(
  states[order(states$AdmitRate, decreasing = TRUE), ],
  10
)
barplot(
  highest_ten_states$AdmitRate,
  names.arg = highest_ten_states$State,
  las = 2,
  col = "lightgreen",
  main = "States with Highest Admission Rates",
  ylab = "Admission Rate"
)

Q3: Which institution type pays faculty the most: public, for-profit, or private?

# Average faculty salary within each Control category.
faculty_salary_by_type <- aggregate(
  FacSalary ~ Control,
  data = college,
  FUN = mean,
  na.rm = TRUE
)
# Keep only the category whose average salary is the largest.
highest_salary_type <- faculty_salary_by_type[
  which.max(faculty_salary_by_type[["FacSalary"]]),
]
print(faculty_salary_by_type)
##   Control FacSalary
## 1 Private  7090.578
## 2  Profit  6233.672
## 3  Public  8520.173
print(highest_salary_type)
##   Control FacSalary
## 3  Public  8520.173
# Compare the full salary distributions, not just the means.
boxplot(
  FacSalary ~ Control,
  data = college,
  main = "Faculty Salaries by Institution Type",
  xlab = "Institution Type",
  ylab = "Faculty Salary",
  col = c("lightblue", "lightgreen", "pink")
)

Q4: What is the average overall student debt?

# Overall mean of the Debt column, skipping schools with no reported value.
average_student_debt <- mean(college[["Debt"]], na.rm = TRUE)
print(average_student_debt)
## [1] 2365.655
# Histogram of Debt across all schools, asking for roughly 20 bins.
hist(
  college$Debt,
  breaks = 20,
  col = "lightblue",
  main = "Distribution of Student Debt",
  xlab = "Average Student Debt"
)

Q5: Do private schools have higher InstructFTE than public schools?

# Mean InstructFTE within each Control category.
instruct_fte_by_type <- aggregate(
  InstructFTE ~ Control,
  data = college,
  FUN = mean,
  na.rm = TRUE
)
print(instruct_fte_by_type)
##   Control InstructFTE
## 1 Private   11340.878
## 2  Profit    4891.524
## 3  Public   11573.742
# Show the spread of InstructFTE for each category alongside the means above.
boxplot(
  InstructFTE ~ Control,
  data = college,
  main = "Instructional Spending per FTE by Institution Type",
  xlab = "Institution Type",
  ylab = "Instructional Spending",
  col = c("lightblue", "lightgreen", "pink")
)

Q6: What is the average out-of-state tuition cost?

# Overall mean of out-of-state tuition; `TuitonOut` is the column name exactly
# as it appears in the dataset.
avg_out_state_tuition <- mean(college[["TuitonOut"]], na.rm = TRUE)
print(avg_out_state_tuition)
## [1] 25336.66
# Break the same variable down by Region to see how it varies geographically.
boxplot(
  TuitonOut ~ Region,
  data = college,
  main = "Out-of-State Tuition Across Regions",
  xlab = "Region",
  ylab = "Tuition Cost",
  col = "lightgreen"
)

Q7: Which states have the highest percentage of first-generation students?

# Mean FirstGen value for each state.
first_gen_by_state <- aggregate(
  FirstGen ~ State,
  data = college,
  FUN = mean,
  na.rm = TRUE
)
# Reorder the rows from the largest state mean down to the smallest.
top_states_first_gen <- first_gen_by_state[
  order(-first_gen_by_state[["FirstGen"]]),
]
print(head(top_states_first_gen, n = 5))
##    State FirstGen
## 49    VI 46.60000
## 34    NM 43.89091
## 35    NV 43.39091
## 12    GU 42.55000
## 53    WV 41.85263
# Chart the first ten rows of the sorted table.
barplot(
  top_states_first_gen$FirstGen[seq_len(10)],
  names.arg = top_states_first_gen$State[seq_len(10)],
  las = 2,
  col = "lightblue",
  main = "Top 10 States with Highest % of First-Generation Students",
  ylab = "Percentage"
)

Q8: Which states have the lowest percentage of first-generation students?

# Mean FirstGen value for each state.
first_gen_by_state <- aggregate(
  FirstGen ~ State,
  data = college,
  FUN = mean,
  na.rm = TRUE
)
# Reorder the rows from the smallest state mean up to the largest.
bottom_states_first_gen <- first_gen_by_state[
  order(first_gen_by_state[["FirstGen"]]),
]
print(head(bottom_states_first_gen, n = 5))
##    State FirstGen
## 42    RI 23.68000
## 25    MN 23.80000
## 30    ND 26.22222
## 50    VT 26.32727
## 14    IA 27.04828
# Chart the first ten rows of the sorted table.
barplot(
  bottom_states_first_gen$FirstGen[seq_len(10)],
  names.arg = bottom_states_first_gen$State[seq_len(10)],
  las = 2,
  col = "lightgreen",
  main = "Top 10 States with Lowest % of First-Generation Students",
  ylab = "Percentage"
)

Q9: What is the mean faculty salary in the Midwest?

# Restrict the data to schools whose Region is "Midwest".
midwest_schools <- subset(college, subset = Region == "Midwest")
# Mean faculty salary among those schools, ignoring missing values.
mean_midwest_faculty_salary <- mean(
  midwest_schools[["FacSalary"]],
  na.rm = TRUE
)
print(mean_midwest_faculty_salary)
## [1] 7113.745
# Put the Midwest in context by plotting every region side by side.
boxplot(
  FacSalary ~ Region,
  data = college,
  main = "Faculty Salaries by Region",
  xlab = "Region",
  ylab = "Faculty Salary",
  col = c("lightblue", "lightgreen", "pink", "orange", "purple")
)

Q10: What is the correlation between being an online-only college and the completion rate?

# Correlation between the 0/1 Online indicator and CompRate, using only the
# schools where both values are present.
correlation <- with(college, cor(Online, CompRate, use = "complete.obs"))
print(
  paste(
    "Correlation between online-only colleges and completion rates:",
    round(correlation, digits = 2)
  )
)
## [1] "Correlation between online-only colleges and completion rates: -0.09"
# Compare the completion-rate distributions of the two groups directly.
boxplot(
  CompRate ~ Online,
  data = college,
  names = c("Non-Online", "Online"),
  col = c("lightblue", "lightgreen"),
  notch = FALSE,
  main = "Completion Rates: Online-Only vs Non-Online Colleges",
  xlab = "Online-Only College (0 = No, 1 = Yes)",
  ylab = "Completion Rate (%)"
)

Summary

Question-by-Question summary:

For question 1, looking at the bar plot, we can see that Washington, DC, Delaware, and California have some of the lowest admission rates. California is highly competitive overall, with schools such as UC Berkeley, UCLA, and others. Washington, DC is also competitive, but it has a lower population density. Delaware doesn’t have as many competitive schools, but it also has a lower population density.

Looking at the bar plot for question 2, we can see that states like South Dakota, Virginia, and Utah have some of the highest admission rates. This may be because these states have less competitive and less prestigious schools, as well as lower population density.