1. Introduction

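I propose ten questions about the CollegeScores4yr data set from the U.S. Department of Education. Each question is stated and answered in the Analysis section below, and the findings are summarized in Section 3.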

2. Analysis

Here we will analyze the questions in further detail using R.

# Read the CollegeScores4yr data straight from the Lock5 site into a data frame.
college <- read.csv(
  "https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv"
)
# Preview the first rows to confirm the columns loaded as expected.
head(college)
##                                  Name State     ID Main
## 1            Alabama A & M University    AL 100654    1
## 2 University of Alabama at Birmingham    AL 100663    1
## 3                  Amridge University    AL 100690    1
## 4 University of Alabama in Huntsville    AL 100706    1
## 5            Alabama State University    AL 100724    1
## 6           The University of Alabama    AL 100751    1
##                                                                Accred
## 1 Southern Association of Colleges and Schools Commission on Colleges
## 2 Southern Association of Colleges and Schools Commission on Colleges
## 3 Southern Association of Colleges and Schools Commission on Colleges
## 4 Southern Association of Colleges and Schools Commission on Colleges
## 5 Southern Association of Colleges and Schools Commission on Colleges
## 6 Southern Association of Colleges and Schools Commission on Colleges
##   MainDegree HighDegree Control    Region Locale Latitude Longitude AdmitRate
## 1          3          4  Public Southeast   City 34.78337 -86.56850    0.9027
## 2          3          4  Public Southeast   City 33.50570 -86.79935    0.9181
## 3          3          4 Private Southeast   City 32.36261 -86.17401        NA
## 4          3          4  Public Southeast   City 34.72456 -86.64045    0.8123
## 5          3          4  Public Southeast   City 32.36432 -86.29568    0.9787
## 6          3          4  Public Southeast   City 33.21187 -87.54598    0.5330
##   MidACT AvgSAT Online Enrollment White Black Hispanic Asian Other PartTime
## 1     18    929      0       4824   2.5  90.7      0.9   0.2   5.6      6.6
## 2     25   1195      0      12866  57.8  25.9      3.3   5.9   7.1     25.2
## 3     NA     NA      1        322   7.1  14.3      0.6   0.3  77.6     54.4
## 4     28   1322      0       6917  74.2  10.7      4.6   4.0   6.5     15.0
## 5     18    935      0       4189   1.5  93.8      1.0   0.3   3.5      7.7
## 6     28   1278      0      32387  78.5  10.1      4.7   1.2   5.6      7.9
##   NetPrice  Cost TuitionIn TuitonOut TuitionFTE InstructFTE FacSalary
## 1    15184 22886      9857     18236       9227        7298      6983
## 2    17535 24129      8328     19032      11612       17235     10640
## 3     9649 15080      6900      6900      14738        5265      3866
## 4    19986 22108     10280     21480       8727        9748      9391
## 5    12874 19413     11068     19396       9003        7983      7399
## 6    21973 28836     10780     28100      13574       10894     10016
##   FullTimeFac Pell CompRate Debt Female FirstGen MedIncome
## 1        71.3 71.0    23.96 1068   56.4     36.6      23.6
## 2        89.9 35.3    52.92 3755   63.9     34.1      34.5
## 3       100.0 74.2    18.18  109   64.9     51.3      15.0
## 4        64.6 27.7    48.62 1347   47.6     31.0      44.8
## 5        54.2 73.8    27.69 1294   61.3     34.3      22.1
## 6        74.0 18.0    67.87 6430   61.5     22.6      66.7

Q1: What are the mean and standard deviation of the average cost across all colleges?

# Mean of the Cost column, skipping colleges where it is missing.
with(college, mean(Cost, na.rm = TRUE))
## [1] 34277.31
# Standard deviation of the same column, again ignoring NAs.
with(college, sd(Cost, na.rm = TRUE))
## [1] 15278.54

Q2: What are the median and range of average debt among students who complete their programs?

# Median of the Debt column with missing values removed.
with(college, median(Debt, na.rm = TRUE))
## [1] 713.5
# Smallest and largest observed Debt values (NAs removed).
with(college, range(Debt, na.rm = TRUE))
## [1]    10 48216

Q3: What is the distribution of completion rates across colleges?

# Histogram of CompRate across all colleges.
with(
  college,
  hist(
    CompRate,
    col = "lightblue",
    xlab = "Completion Rate (%)",
    main = "Distribution of Completion Rates"
  )
)

Q4: How does the average faculty salary vary by type of control?

# Side-by-side boxplots of FacSalary, one box per level of Control.
boxplot(
  FacSalary ~ Control,
  data = college,
  col = "lightgreen",
  ylab = "Average Monthly Salary",
  main = "Faculty Salary by Control Type"
)

Q5: What is the correlation between average SAT scores and admission rate?

# Pearson correlation of AvgSAT and AdmitRate; "complete.obs" uses only the
# colleges where both values are present.
with(college, cor(AvgSAT, AdmitRate, use = "complete.obs"))
## [1] -0.4221255
# Scatterplot with AdmitRate on the x-axis and AvgSAT on the y-axis.
with(
  college,
  plot(
    AdmitRate,
    AvgSAT,
    xlab = "Admission Rate",
    ylab = "Average SAT",
    main = "Admission Rate vs. SAT"
  )
)

Q6: What are the mean and variance of in-state tuition across colleges?

# Mean of the TuitionIn column, ignoring missing values.
with(college, mean(TuitionIn, na.rm = TRUE))
## [1] 21948.55
# Sample variance of TuitionIn (in squared units of the column), NAs removed.
with(college, var(TuitionIn, na.rm = TRUE))
## [1] 199665280

Q7: What is the distribution of percent of female students among all colleges?

# Histogram of the Female column across all colleges.
with(
  college,
  hist(
    Female,
    col = "pink",
    xlab = "Percent Female",
    main = "Distribution of Female Students"
  )
)

Q8: How does the median net price differ among regions?

# Side-by-side boxplots of NetPrice, one box per Region; the line inside each
# box marks that region's median.
boxplot(
  NetPrice ~ Region,
  data = college,
  col = "orange",
  ylab = "Average Net Price ($)",
  main = "Net Price by Region"
)

Q9: What is the correlation between median family income and average debt?

# Pearson correlation of MedIncome and Debt; "complete.obs" uses only the
# colleges where both values are present.
with(college, cor(MedIncome, Debt, use = "complete.obs"))
## [1] -0.1207221
# Scatterplot with MedIncome on the x-axis and Debt on the y-axis.
with(
  college,
  plot(
    MedIncome,
    Debt,
    xlab = "Median Family Income ($1000s)",
    ylab = "Average Debt ($)",
    main = "Median Family Income vs. Student Debt"
  )
)

Q10: What is the distribution of percent of part-time students across colleges?

# Histogram of the PartTime column across all colleges.
with(
  college,
  hist(
    PartTime,
    col = "lightgray",
    xlab = "Percent Part-Time",
    main = "Distribution of Part-Time Students"
  )
)

# Text-based stem-and-leaf display of the same column.
with(college, stem(PartTime))
## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##    0 | 00000000000000000000000000000000000000000000000000000000000000000000+926
##    1 | 00000000000000000000000000000000000000000000000000000000000000000000+367
##    2 | 00000000000000000000000000011111111111111111111111111111111111111222+138
##    3 | 00000000000001111111112222222222222333333333333334444444444444555556+27
##    4 | 000011111111112222223333333444445555566677777778888888999999999
##    5 | 0000000011111111222223333344444455555666666777777788889999
##    6 | 00000111113334445566777777888889999
##    7 | 01111222333455677778889
##    8 | 00011122233444445677888999
##    9 | 000112222334444557899
##   10 | 0000000

3. Summary

1. College Cost

The mean total cost across colleges is about $34,277 with a large standard deviation (~$15,279). This indicates wide variation in college expenses, reflecting differences in tuition, housing, and regional costs.

2. Student Debt

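The median student debt is approximately $714, but the range extends from $10 to over $48,000. Because the median sits so close to the minimum and so far below the maximum, the distribution is strongly right-skewed. This extreme range suggests that while some institutions leave students with minimal debt, others produce very high debt loads.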

3. Completion Rates

The histogram shows that most colleges have completion rates between 40% and 60%, with the highest frequency near the midpoint (around 350 schools). This suggests that many institutions achieve moderate graduation success, but few reach either very low or very high completion percentages.

4. Faculty Salaries by Control Type

The boxplot reveals that public colleges have a higher median faculty salary, while private institutions show greater variation with both higher and lower outliers. For-profit schools tend to cluster at lower salary levels, indicating differing resource levels and pay structures across control types.

5. SAT and Admission Rate

The correlation between average SAT and admission rate is -0.42, indicating a moderate negative relationship. Colleges with higher average SAT scores tend to have lower admission rates, consistent with greater selectivity.

6. In-State Tuition

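The average in-state tuition is around $21,949 with very high variance (about 199.7 million squared dollars, which corresponds to a standard deviation of roughly $14,130). This shows that in-state tuition differs dramatically among institutions, likely reflecting differences between states and between types of institutions, although this analysis does not break the figures down by state or by control type.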

7. Female Enrollment

The histogram shows that female enrollment peaks just below 60%, with a frequency of roughly 700 schools in that range. This indicates that most colleges enroll slightly more women than men, consistent with national higher education trends.

8. Net Price by Region

The boxplot indicates that colleges in the Northeast and Midwest have the highest median net prices, while those in Territories show the lowest. The Northeast and West also exhibit greater variability in net prices, suggesting larger disparities in institutional affordability, with the Midwest showing several high-cost outliers.

9. Family Income and Student Debt

The correlation between median family income and average debt is -0.12, a weak negative relationship. Because each observation is a college rather than a student, this implies that colleges whose students come from wealthier families tend to have slightly lower average debt, though the relationship is not strong. Most institutions cluster around moderate debt levels, with fewer cases at either extreme.

10. Part-Time Students

The histogram and stem-and-leaf display show that most colleges have very low percentages of part-time students, with the highest frequency between 0% and 10%. The number of colleges drops sharply as part-time enrollment increases and then levels off at roughly 20 to 35 colleges per ten-point interval between 60% and 100%, suggesting that most institutions primarily serve full-time students, while a smaller subset caters to flexible or nontraditional learners.

4. Appendix: R Code

Below is the complete R code used in this analysis.

# Complete analysis script for the CollegeScores4yr questions (Q1-Q10).
# Each numeric summary auto-prints its result; each plotting call draws to the
# active graphics device.

# Load the data ----
# Read the CSV from the Lock5 site and preview the first rows.
college <- read.csv(
  "https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv"
)
head(college)

# Q1: Mean and SD of Cost ----
# na.rm = TRUE drops colleges with a missing Cost before summarizing.
mean(college$Cost, na.rm = TRUE)
sd(college$Cost, na.rm = TRUE)

# Q2: Median and Range of Debt ----
median(college$Debt, na.rm = TRUE)
range(college$Debt, na.rm = TRUE)

# Q3: Histogram of Completion Rate ----
hist(
  college$CompRate,
  main = "Distribution of Completion Rates",
  xlab = "Completion Rate (%)",
  col = "lightblue"
)

# Q4: Faculty Salary by Control ----
# The formula interface draws one box per level of Control.
boxplot(
  FacSalary ~ Control,
  data = college,
  main = "Faculty Salary by Control Type",
  ylab = "Average Monthly Salary",
  col = "lightgreen"
)

# Q5: SAT vs. Admission Rate ----
# "complete.obs" restricts the correlation to rows where both values exist.
cor(college$AvgSAT, college$AdmitRate, use = "complete.obs")
plot(
  college$AdmitRate,
  college$AvgSAT,
  main = "Admission Rate vs. SAT",
  xlab = "Admission Rate",
  ylab = "Average SAT"
)

# Q6: Mean and Variance of In-State Tuition ----
mean(college$TuitionIn, na.rm = TRUE)
var(college$TuitionIn, na.rm = TRUE)

# Q7: Histogram of Female Percentage ----
hist(
  college$Female,
  main = "Distribution of Female Students",
  xlab = "Percent Female",
  col = "pink"
)

# Q8: Net Price by Region ----
# The line inside each box marks that region's median net price.
boxplot(
  NetPrice ~ Region,
  data = college,
  main = "Net Price by Region",
  ylab = "Average Net Price ($)",
  col = "orange"
)

# Q9: Family Income vs. Debt ----
cor(college$MedIncome, college$Debt, use = "complete.obs")
plot(
  college$MedIncome,
  college$Debt,
  main = "Median Family Income vs. Student Debt",
  xlab = "Median Family Income ($1000s)",
  ylab = "Average Debt ($)"
)

# Q10: Histogram and Stem of Part-Time Students ----
hist(
  college$PartTime,
  main = "Distribution of Part-Time Students",
  xlab = "Percent Part-Time",
  col = "lightgray"
)
stem(college$PartTime)