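We used the data from https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv
We propose the following 10 questions based on our own understanding of the data:
1. Which state has the lowest average admission rate?
2. Which state has the highest average admission rate?
3. Which type of institution (Private, Profit, Public) pays the highest average faculty salary?
4. What is the average student debt across all colleges?
5. How does instructional spending per FTE differ by institution type?
6. What is the average out-of-state tuition, and how does it vary across regions?
7. Which states have the highest percentage of first-generation students?
8. Which states have the lowest percentage of first-generation students?
9. What is the average faculty salary at schools in the Midwest?
10. Is there a correlation between being an online-only college and the completion rate?
We will explore the questions in detail.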
# Load the CollegeScores4yr dataset straight from its published URL.
# Every analysis below reads from this `college` data frame.
college <- read.csv(
  "https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv"
)
# Preview the first six rows to check the column names and value formats.
head(college)
## Name State ID Main
## 1 Alabama A & M University AL 100654 1
## 2 University of Alabama at Birmingham AL 100663 1
## 3 Amridge University AL 100690 1
## 4 University of Alabama in Huntsville AL 100706 1
## 5 Alabama State University AL 100724 1
## 6 The University of Alabama AL 100751 1
## Accred
## 1 Southern Association of Colleges and Schools Commission on Colleges
## 2 Southern Association of Colleges and Schools Commission on Colleges
## 3 Southern Association of Colleges and Schools Commission on Colleges
## 4 Southern Association of Colleges and Schools Commission on Colleges
## 5 Southern Association of Colleges and Schools Commission on Colleges
## 6 Southern Association of Colleges and Schools Commission on Colleges
## MainDegree HighDegree Control Region Locale Latitude Longitude AdmitRate
## 1 3 4 Public Southeast City 34.78337 -86.56850 0.9027
## 2 3 4 Public Southeast City 33.50570 -86.79935 0.9181
## 3 3 4 Private Southeast City 32.36261 -86.17401 NA
## 4 3 4 Public Southeast City 34.72456 -86.64045 0.8123
## 5 3 4 Public Southeast City 32.36432 -86.29568 0.9787
## 6 3 4 Public Southeast City 33.21187 -87.54598 0.5330
## MidACT AvgSAT Online Enrollment White Black Hispanic Asian Other PartTime
## 1 18 929 0 4824 2.5 90.7 0.9 0.2 5.6 6.6
## 2 25 1195 0 12866 57.8 25.9 3.3 5.9 7.1 25.2
## 3 NA NA 1 322 7.1 14.3 0.6 0.3 77.6 54.4
## 4 28 1322 0 6917 74.2 10.7 4.6 4.0 6.5 15.0
## 5 18 935 0 4189 1.5 93.8 1.0 0.3 3.5 7.7
## 6 28 1278 0 32387 78.5 10.1 4.7 1.2 5.6 7.9
## NetPrice Cost TuitionIn TuitonOut TuitionFTE InstructFTE FacSalary
## 1 15184 22886 9857 18236 9227 7298 6983
## 2 17535 24129 8328 19032 11612 17235 10640
## 3 9649 15080 6900 6900 14738 5265 3866
## 4 19986 22108 10280 21480 8727 9748 9391
## 5 12874 19413 11068 19396 9003 7983 7399
## 6 21973 28836 10780 28100 13574 10894 10016
## FullTimeFac Pell CompRate Debt Female FirstGen MedIncome
## 1 71.3 71.0 23.96 1068 56.4 36.6 23.6
## 2 89.9 35.3 52.92 3755 63.9 34.1 34.5
## 3 100.0 74.2 18.18 109 64.9 51.3 15.0
## 4 64.6 27.7 48.62 1347 47.6 31.0 44.8
## 5 54.2 73.8 27.69 1294 61.3 34.3 22.1
## 6 74.0 18.0 67.87 6430 61.5 22.6 66.7
# Question: which state has the lowest mean admission rate?
# Average AdmitRate within each State; na.rm = TRUE is forwarded to mean().
states <- aggregate(AdmitRate ~ State, college, mean, na.rm = TRUE)
# which.min() returns the position of the smallest mean; keep that one row.
lowest_state <- states[which.min(states$AdmitRate), ]
print(lowest_state)
## State AdmitRate
## 8 DC 0.5162143
# The aggregated table is ordered by state code, not by admission rate, so
# rows 1:10 are NOT the ten lowest rates. Sort ascending first, then keep the
# first ten rows so the chart matches its title.
lowest_states <- head(states[order(states$AdmitRate), ], 10)
barplot(lowest_states$AdmitRate, names.arg = lowest_states$State, las = 2,
        col = "lightblue",
        main = "States with Lowest Admission Rates", ylab = "Admission Rate")
# Question: which state has the highest mean admission rate?
# Average AdmitRate within each State; na.rm = TRUE is forwarded to mean().
states <- aggregate(AdmitRate ~ State, college, mean, na.rm = TRUE)
# which.max() returns the position of the largest mean; keep that one row.
highest_state <- states[which.max(states$AdmitRate), ]
print(highest_state)
## State AdmitRate
## 12 GU 1
# The aggregated table is ordered by state code, not by admission rate, so
# rows 41:50 are just an alphabetical slice. Sort descending first, then keep
# the first ten rows so the chart matches its title.
highest_states <- head(
  states[order(states$AdmitRate, decreasing = TRUE), ],
  10
)
barplot(highest_states$AdmitRate, names.arg = highest_states$State, las = 2,
        col = "lightgreen",
        main = "States with Highest Admission Rates", ylab = "Admission Rate")
# Question: which institution type pays the highest mean faculty salary?
# Mean FacSalary for each value of Control.
faculty_salary_by_type <- aggregate(
  FacSalary ~ Control,
  data = college,
  FUN = mean,
  na.rm = TRUE
)
# Keep the single row whose mean salary is the largest.
highest_salary_type <-
  faculty_salary_by_type[which.max(faculty_salary_by_type[["FacSalary"]]), ]
print(faculty_salary_by_type)
## Control FacSalary
## 1 Private 7090.578
## 2 Profit 6233.672
## 3 Public 8520.173
print(highest_salary_type)
## Control FacSalary
## 3 Public 8520.173
# One box per control type, showing the spread behind the means above.
boxplot(
  FacSalary ~ Control,
  data = college,
  col = c("lightblue", "lightgreen", "pink"),
  main = "Faculty Salaries by Institution Type",
  xlab = "Institution Type",
  ylab = "Faculty Salary"
)
# Question: what is the mean student debt across all colleges?
# Mean of the Debt column, skipping missing values.
average_student_debt <- mean(college[["Debt"]], na.rm = TRUE)
print(average_student_debt)
## [1] 2365.655
# Histogram of the same column, requesting roughly 20 bins.
hist(
  college[["Debt"]],
  breaks = 20,
  col = "lightblue",
  main = "Distribution of Student Debt",
  xlab = "Average Student Debt"
)
# Question: how does instructional spending per FTE differ by control type?
# Mean InstructFTE for each value of Control.
instruct_fte_by_type <- aggregate(
  InstructFTE ~ Control,
  data = college,
  FUN = mean,
  na.rm = TRUE
)
print(instruct_fte_by_type)
## Control InstructFTE
## 1 Private 11340.878
## 2 Profit 4891.524
## 3 Public 11573.742
# One box per control type, showing the spread behind the means above.
boxplot(
  InstructFTE ~ Control,
  data = college,
  col = c("lightblue", "lightgreen", "pink"),
  main = "Instructional Spending per FTE by Institution Type",
  xlab = "Institution Type",
  ylab = "Instructional Spending"
)
# Question: what is the mean out-of-state tuition, and how does it vary by
# region? "TuitonOut" (sic) is the column name as spelled in the dataset.
avg_out_state_tuition <- mean(college[["TuitonOut"]], na.rm = TRUE)
print(avg_out_state_tuition)
## [1] 25336.66
# One box per Region for the same tuition column.
boxplot(
  TuitonOut ~ Region,
  data = college,
  col = "lightgreen",
  main = "Out-of-State Tuition Across Regions",
  xlab = "Region",
  ylab = "Tuition Cost"
)
# Question: which states have the highest share of first-generation students?
# Mean FirstGen percentage for each State.
first_gen_by_state <- aggregate(
  FirstGen ~ State,
  data = college,
  FUN = mean,
  na.rm = TRUE
)
# Negating the key makes order() sort from largest to smallest mean.
top_states_first_gen <-
  first_gen_by_state[order(-first_gen_by_state[["FirstGen"]]), ]
print(head(top_states_first_gen, n = 5))
## State FirstGen
## 49 VI 46.60000
## 34 NM 43.89091
## 35 NV 43.39091
## 12 GU 42.55000
## 53 WV 41.85263
# The table is already sorted, so its first ten rows are the ten highest.
barplot(
  top_states_first_gen[["FirstGen"]][1:10],
  names.arg = top_states_first_gen[["State"]][1:10],
  las = 2,
  col = "lightblue",
  main = "Top 10 States with Highest % of First-Generation Students",
  ylab = "Percentage"
)
# Question: which states have the lowest share of first-generation students?
# Mean FirstGen percentage for each State.
first_gen_by_state <- aggregate(
  FirstGen ~ State,
  data = college,
  FUN = mean,
  na.rm = TRUE
)
# Ascending sort puts the smallest means at the top of the table.
bottom_states_first_gen <-
  first_gen_by_state[order(first_gen_by_state[["FirstGen"]]), ]
print(head(bottom_states_first_gen, n = 5))
## State FirstGen
## 42 RI 23.68000
## 25 MN 23.80000
## 30 ND 26.22222
## 50 VT 26.32727
## 14 IA 27.04828
# The table is already sorted, so its first ten rows are the ten lowest.
barplot(
  bottom_states_first_gen[["FirstGen"]][1:10],
  names.arg = bottom_states_first_gen[["State"]][1:10],
  las = 2,
  col = "lightgreen",
  main = "Top 10 States with Lowest % of First-Generation Students",
  ylab = "Percentage"
)
# Question: what is the mean faculty salary at Midwest schools?
# which() keeps only rows where the comparison is TRUE, so rows with a
# missing Region are dropped rather than carried along as NA rows.
midwest_schools <- college[which(college[["Region"]] == "Midwest"), ]
mean_midwest_faculty_salary <- mean(
  midwest_schools[["FacSalary"]],
  na.rm = TRUE
)
print(mean_midwest_faculty_salary)
## [1] 7113.745
# Compare all regions side by side, one box per Region.
boxplot(
  FacSalary ~ Region,
  data = college,
  col = c("lightblue", "lightgreen", "pink", "orange", "purple"),
  main = "Faculty Salaries by Region",
  xlab = "Region",
  ylab = "Faculty Salary"
)
# Question: is being online-only related to the completion rate?
# Correlation between the Online column and CompRate, computed only on rows
# where both values are present.
correlation <- cor(
  x = college[["Online"]],
  y = college[["CompRate"]],
  use = "complete.obs"
)
print(
  paste(
    "Correlation between online-only colleges and completion rates:",
    round(correlation, digits = 2)
  )
)
## [1] "Correlation between online-only colleges and completion rates: -0.09"
# One box per value of Online, relabelled for readability.
boxplot(
  CompRate ~ Online,
  data = college,
  names = c("Non-Online", "Online"),
  col = c("lightblue", "lightgreen"),
  notch = FALSE,
  main = "Completion Rates: Online-Only vs Non-Online Colleges",
  xlab = "Online-Only College (0 = No, 1 = Yes)",
  ylab = "Completion Rate (%)"
)