We used the data from https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv
We propose the following 10 questions based on our own understanding of the data:
We will explore the questions in detail.
# Load the CollegeScores4yr dataset directly from its URL, then preview the
# first rows to confirm the columns were parsed as expected.
college <- read.csv("https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv")
head(college)
## Name State ID Main
## 1 Alabama A & M University AL 100654 1
## 2 University of Alabama at Birmingham AL 100663 1
## 3 Amridge University AL 100690 1
## 4 University of Alabama in Huntsville AL 100706 1
## 5 Alabama State University AL 100724 1
## 6 The University of Alabama AL 100751 1
## Accred
## 1 Southern Association of Colleges and Schools Commission on Colleges
## 2 Southern Association of Colleges and Schools Commission on Colleges
## 3 Southern Association of Colleges and Schools Commission on Colleges
## 4 Southern Association of Colleges and Schools Commission on Colleges
## 5 Southern Association of Colleges and Schools Commission on Colleges
## 6 Southern Association of Colleges and Schools Commission on Colleges
## MainDegree HighDegree Control Region Locale Latitude Longitude AdmitRate
## 1 3 4 Public Southeast City 34.78337 -86.56850 0.9027
## 2 3 4 Public Southeast City 33.50570 -86.79935 0.9181
## 3 3 4 Private Southeast City 32.36261 -86.17401 NA
## 4 3 4 Public Southeast City 34.72456 -86.64045 0.8123
## 5 3 4 Public Southeast City 32.36432 -86.29568 0.9787
## 6 3 4 Public Southeast City 33.21187 -87.54598 0.5330
## MidACT AvgSAT Online Enrollment White Black Hispanic Asian Other PartTime
## 1 18 929 0 4824 2.5 90.7 0.9 0.2 5.6 6.6
## 2 25 1195 0 12866 57.8 25.9 3.3 5.9 7.1 25.2
## 3 NA NA 1 322 7.1 14.3 0.6 0.3 77.6 54.4
## 4 28 1322 0 6917 74.2 10.7 4.6 4.0 6.5 15.0
## 5 18 935 0 4189 1.5 93.8 1.0 0.3 3.5 7.7
## 6 28 1278 0 32387 78.5 10.1 4.7 1.2 5.6 7.9
## NetPrice Cost TuitionIn TuitonOut TuitionFTE InstructFTE FacSalary
## 1 15184 22886 9857 18236 9227 7298 6983
## 2 17535 24129 8328 19032 11612 17235 10640
## 3 9649 15080 6900 6900 14738 5265 3866
## 4 19986 22108 10280 21480 8727 9748 9391
## 5 12874 19413 11068 19396 9003 7983 7399
## 6 21973 28836 10780 28100 13574 10894 10016
## FullTimeFac Pell CompRate Debt Female FirstGen MedIncome
## 1 71.3 71.0 23.96 1068 56.4 36.6 23.6
## 2 89.9 35.3 52.92 3755 63.9 34.1 34.5
## 3 100.0 74.2 18.18 109 64.9 51.3 15.0
## 4 64.6 27.7 48.62 1347 47.6 31.0 44.8
## 5 54.2 73.8 27.69 1294 61.3 34.3 22.1
## 6 74.0 18.0 67.87 6430 61.5 22.6 66.7
# Mean admission rate per state, followed by the one state whose mean is the
# smallest.
states <- aggregate(
  AdmitRate ~ State,
  data = college,
  FUN = mean,
  na.rm = TRUE
)
lowest_state <- states[which.min(states[["AdmitRate"]]), ]
print(lowest_state)
## State AdmitRate
## 8 DC 0.5162143
# Bar plot of the ten states with the lowest mean admission rate.
# `states` is ordered by state code, so rows 1:10 would just be the first ten
# codes alphabetically; sort by AdmitRate first so the plot matches its title.
lowest_10_states <- head(states[order(states$AdmitRate), ], 10)
barplot(
  lowest_10_states$AdmitRate,
  names.arg = lowest_10_states$State,
  las = 2, # draw the state labels perpendicular to the axis
  col = "lightblue",
  main = "States with Lowest Admission Rates",
  ylab = "Admission Rate"
)
# Recompute the per-state mean admission rate and pull out the state whose
# mean is the largest.
states <- aggregate(
  AdmitRate ~ State,
  data = college,
  FUN = mean,
  na.rm = TRUE
)
highest_state <- states[which.max(states[["AdmitRate"]]), ]
print(highest_state)
## State AdmitRate
## 12 GU 1
# Bar plot of the ten states with the highest mean admission rate.
# `states` is ordered by state code, so rows 41:50 would be an alphabetical
# slice (and would silently depend on how many states are present); sort by
# AdmitRate in decreasing order so the plot matches its title.
highest_10_states <- head(
  states[order(states$AdmitRate, decreasing = TRUE), ],
  10
)
barplot(
  highest_10_states$AdmitRate,
  names.arg = highest_10_states$State,
  las = 2, # draw the state labels perpendicular to the axis
  col = "lightgreen",
  main = "States with Highest Admission Rates",
  ylab = "Admission Rate"
)
# Mean faculty salary for each value of Control, plus the single row holding
# the largest of those means.
faculty_salary_by_type <- aggregate(
  FacSalary ~ Control,
  data = college,
  FUN = mean,
  na.rm = TRUE
)
highest_salary_type <- faculty_salary_by_type[
  which.max(faculty_salary_by_type[["FacSalary"]]),
]
print(faculty_salary_by_type)
## Control FacSalary
## 1 Private 7090.578
## 2 Profit 6233.672
## 3 Public 8520.173
# Show the Control category whose mean faculty salary is the largest.
print(highest_salary_type)
## Control FacSalary
## 3 Public 8520.173
# Side-by-side box plots of faculty salary, one box per Control category.
boxplot(
  FacSalary ~ Control,
  data = college,
  col = c("lightblue", "lightgreen", "pink"),
  main = "Faculty Salaries by Institution Type",
  xlab = "Institution Type",
  ylab = "Faculty Salary"
)
# Overall mean of the Debt column, skipping schools with no value recorded.
average_student_debt <- mean(college[["Debt"]], na.rm = TRUE)
print(average_student_debt)
## [1] 2365.655
# Histogram of the Debt column across all schools (20 suggested bins).
hist(
  college[["Debt"]],
  breaks = 20,
  col = "lightblue",
  main = "Distribution of Student Debt",
  xlab = "Average Student Debt"
)
# Mean of the InstructFTE column for each Control category.
instruct_fte_by_type <- aggregate(
  InstructFTE ~ Control,
  data = college,
  FUN = mean,
  na.rm = TRUE
)
print(instruct_fte_by_type)
## Control InstructFTE
## 1 Private 11340.878
## 2 Profit 4891.524
## 3 Public 11573.742
# Box plots of InstructFTE, one box per Control category.
boxplot(
  InstructFTE ~ Control,
  data = college,
  col = c("lightblue", "lightgreen", "pink"),
  main = "Instructional Spending per FTE by Institution Type",
  xlab = "Institution Type",
  ylab = "Instructional Spending"
)
# Overall mean out-of-state tuition, ignoring missing values.
# The column really is spelled "TuitonOut" in the dataset header.
avg_out_state_tuition <- mean(college[["TuitonOut"]], na.rm = TRUE)
print(avg_out_state_tuition)
## [1] 25336.66
# Box plots of out-of-state tuition ("TuitonOut" as spelled in the dataset),
# one box per Region.
boxplot(
  TuitonOut ~ Region,
  data = college,
  col = "lightgreen",
  main = "Out-of-State Tuition Across Regions",
  xlab = "Region",
  ylab = "Tuition Cost"
)
# Mean FirstGen value per state, sorted from highest to lowest; print the
# five states at the top of that ranking.
first_gen_by_state <- aggregate(
  FirstGen ~ State,
  data = college,
  FUN = mean,
  na.rm = TRUE
)
top_states_first_gen <- first_gen_by_state[
  order(-first_gen_by_state[["FirstGen"]]),
]
print(head(top_states_first_gen, n = 5))
## State FirstGen
## 49 VI 46.60000
## 34 NM 43.89091
## 35 NV 43.39091
## 12 GU 42.55000
## 53 WV 41.85263
# Bar plot of the first ten rows of the descending FirstGen ranking.
barplot(
  top_states_first_gen[["FirstGen"]][1:10],
  names.arg = top_states_first_gen[["State"]][1:10],
  las = 2,
  col = "lightblue",
  main = "Top 10 States with Highest % of First-Generation Students",
  ylab = "Percentage"
)
# Mean FirstGen value per state, sorted from lowest to highest; print the
# five states at the bottom of the ranking.
first_gen_by_state <- aggregate(
  FirstGen ~ State,
  data = college,
  FUN = mean,
  na.rm = TRUE
)
bottom_states_first_gen <- first_gen_by_state[
  order(first_gen_by_state[["FirstGen"]]),
]
print(head(bottom_states_first_gen, n = 5))
## State FirstGen
## 42 RI 23.68000
## 25 MN 23.80000
## 30 ND 26.22222
## 50 VT 26.32727
## 14 IA 27.04828
# Bar plot of the first ten rows of the ascending FirstGen ranking.
barplot(
  bottom_states_first_gen[["FirstGen"]][1:10],
  names.arg = bottom_states_first_gen[["State"]][1:10],
  las = 2,
  col = "lightgreen",
  main = "Top 10 States with Lowest % of First-Generation Students",
  ylab = "Percentage"
)
# Keep only the schools whose Region is "Midwest" and average their faculty
# salary, ignoring missing values.
midwest_schools <- subset(college, Region == "Midwest")
mean_midwest_faculty_salary <- with(
  midwest_schools,
  mean(FacSalary, na.rm = TRUE)
)
print(mean_midwest_faculty_salary)
## [1] 7113.745
# Box plots of faculty salary, one box per Region, for comparison with the
# Midwest mean.
boxplot(
  FacSalary ~ Region,
  data = college,
  col = c("lightblue", "lightgreen", "pink", "orange", "purple"),
  main = "Faculty Salaries by Region",
  xlab = "Region",
  ylab = "Faculty Salary"
)
# Correlation between the Online indicator and CompRate, computed only over
# schools where both values are present, reported to two decimals.
correlation <- with(college, cor(Online, CompRate, use = "complete.obs"))
print(
  paste(
    "Correlation between online-only colleges and completion rates:",
    round(correlation, 2)
  )
)
## [1] "Correlation between online-only colleges and completion rates: -0.09"
# Box plots of completion rate split by the Online indicator; the two groups
# are relabelled so the axis reads in words rather than 0/1.
boxplot(
  CompRate ~ Online,
  data = college,
  names = c("Non-Online", "Online"),
  col = c("lightblue", "lightgreen"),
  notch = FALSE,
  main = "Completion Rates: Online-Only vs Non-Online Colleges",
  xlab = "Online-Only College (0 = No, 1 = Yes)",
  ylab = "Completion Rate (%)"
)
Question-by-Question summary:
For question 1, the bar plot shows that, among the states displayed, Washington DC, Delaware, and California have some of the lowest admission rates. California is highly competitive overall, with schools such as UC Berkeley and UCLA, among others. Washington DC is also competitive, but it also has a lower population density. Delaware doesn’t have as many competitive schools, but it has a lower population density.
For question 2, the bar plot shows that, among the states displayed, South Dakota, Virginia, and Utah have some of the highest admission rates. This may be because their schools are less competitive and less prestigious, and because these states have a lower population density.