We use the data from Statistics: Unlocking the Power of Data
by Lock, Lock, Lock, Lock, and Lock
I propose the following 10 questions based on my curiosity
The 10 questions proposed by ChatGPT:
The final 10 questions:
We will explore the questions in detail
# Read the four-year college scores data set that accompanies the Lock5
# textbook straight from the publisher's site (requires network access).
# `<-` is used for assignment, matching the code appendix later in this file.
college <- read.csv("https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv")
# Preview the first rows to check column names and value scales.
head(college)
## Name State ID Main
## 1 Alabama A & M University AL 100654 1
## 2 University of Alabama at Birmingham AL 100663 1
## 3 Amridge University AL 100690 1
## 4 University of Alabama in Huntsville AL 100706 1
## 5 Alabama State University AL 100724 1
## 6 The University of Alabama AL 100751 1
## Accred
## 1 Southern Association of Colleges and Schools Commission on Colleges
## 2 Southern Association of Colleges and Schools Commission on Colleges
## 3 Southern Association of Colleges and Schools Commission on Colleges
## 4 Southern Association of Colleges and Schools Commission on Colleges
## 5 Southern Association of Colleges and Schools Commission on Colleges
## 6 Southern Association of Colleges and Schools Commission on Colleges
## MainDegree HighDegree Control Region Locale Latitude Longitude AdmitRate
## 1 3 4 Public Southeast City 34.78337 -86.56850 0.9027
## 2 3 4 Public Southeast City 33.50570 -86.79935 0.9181
## 3 3 4 Private Southeast City 32.36261 -86.17401 NA
## 4 3 4 Public Southeast City 34.72456 -86.64045 0.8123
## 5 3 4 Public Southeast City 32.36432 -86.29568 0.9787
## 6 3 4 Public Southeast City 33.21187 -87.54598 0.5330
## MidACT AvgSAT Online Enrollment White Black Hispanic Asian Other PartTime
## 1 18 929 0 4824 2.5 90.7 0.9 0.2 5.6 6.6
## 2 25 1195 0 12866 57.8 25.9 3.3 5.9 7.1 25.2
## 3 NA NA 1 322 7.1 14.3 0.6 0.3 77.6 54.4
## 4 28 1322 0 6917 74.2 10.7 4.6 4.0 6.5 15.0
## 5 18 935 0 4189 1.5 93.8 1.0 0.3 3.5 7.7
## 6 28 1278 0 32387 78.5 10.1 4.7 1.2 5.6 7.9
## NetPrice Cost TuitionIn TuitonOut TuitionFTE InstructFTE FacSalary
## 1 15184 22886 9857 18236 9227 7298 6983
## 2 17535 24129 8328 19032 11612 17235 10640
## 3 9649 15080 6900 6900 14738 5265 3866
## 4 19986 22108 10280 21480 8727 9748 9391
## 5 12874 19413 11068 19396 9003 7983 7399
## 6 21973 28836 10780 28100 13574 10894 10016
## FullTimeFac Pell CompRate Debt Female FirstGen MedIncome
## 1 71.3 71.0 23.96 1068 56.4 36.6 23.6
## 2 89.9 35.3 52.92 3755 63.9 34.1 34.5
## 3 100.0 74.2 18.18 109 64.9 51.3 15.0
## 4 64.6 27.7 48.62 1347 47.6 31.0 44.8
## 5 54.2 73.8 27.69 1294 61.3 34.3 22.1
## 6 74.0 18.0 67.87 6430 61.5 22.6 66.7
# Centre and spread of the annual cost variable; missing values are skipped.
with(college, mean(Cost, na.rm = TRUE))
## [1] 34277.31
with(college, median(Cost, na.rm = TRUE))
## [1] 30699
with(college, sd(Cost, na.rm = TRUE))
## [1] 15278.54
# Histogram of cost across all colleges in the data set.
hist(
  x = college[["Cost"]],
  col = "Blue",
  border = "white",
  xlab = "Average Annual Cost ($)",
  main = "Distribution of Annual Cost"
)
# Median completion rate for each control type (Private, Profit, Public).
aggregate(CompRate ~ Control, data = college, FUN = median, na.rm = TRUE)
## Control CompRate
## 1 Private 56.240
## 2 Profit 26.390
## 3 Public 48.575
# Side-by-side boxplots of completion rate by control type.
# There are three groups, so three fill colours are supplied: with only two,
# R recycles the vector and Private and Public end up with the same fill.
# A solid black fill is avoided because it hides the (black) median bar.
boxplot(
  CompRate ~ Control,
  data = college,
  main = "Completion Rate by College Type",
  xlab = "College Type",
  ylab = "Completion Rate (%)",
  col = c("white", "gray70", "gray40")
)
# Mean faculty salary per region, shown as a table.
aggregate(FacSalary ~ Region, data = college, FUN = mean, na.rm = TRUE)
## Region FacSalary
## 1 Midwest 7113.745
## 2 Northeast 8305.674
## 3 Southeast 6820.719
## 4 Territory 4333.978
## 5 West 7837.133
# The same regional means as a bar chart; las = 2 turns the axis labels
# perpendicular to the axis so the region names do not overlap.
barplot(
  with(college, tapply(FacSalary, Region, mean, na.rm = TRUE)),
  las = 2,
  col = "red",
  main = "Average Faculty Salary by Region",
  xlab = "Region",
  ylab = "Average Faculty Salary ($)"
)
# Summary statistics for the admission rate. AdmitRate is stored as a
# proportion between 0 and 1 (e.g. 0.67), not as a percentage.
mean(college$AdmitRate, na.rm = TRUE)
## [1] 0.6702025
median(college$AdmitRate, na.rm = TRUE)
## [1] 0.69505
sd(college$AdmitRate, na.rm = TRUE)
## [1] 0.208179
# Histogram of admission rates. The axis label states the actual unit
# (proportion); the previous "(%)" label did not match the 0-1 scale.
hist(college$AdmitRate,
     main = "Distribution of Admission Rates",
     xlab = "Admission Rate (proportion)",
     col = "lightblue")
# Pearson correlation between average SAT and completion rate, computed on
# the colleges that report both values.
with(college, cor(AvgSAT, CompRate, use = "complete.obs"))
## [1] 0.8189495
# Scatterplot of the same pair of variables (solid red points).
with(
  college,
  plot(
    AvgSAT, CompRate,
    pch = 19,
    col = "red",
    xlab = "Average SAT Score",
    ylab = "Completion Rate (%)",
    main = "SAT vs Completion Rate"
  )
)
# Mean, standard deviation and variance of the debt variable; the variance is
# the square of the standard deviation and is shown for completeness.
with(college, mean(Debt, na.rm = TRUE))
## [1] 2365.655
with(college, sd(Debt, na.rm = TRUE))
## [1] 5360.986
with(college, var(Debt, na.rm = TRUE))
## [1] 28740171
# Histogram of debt across colleges.
hist(
  x = college[["Debt"]],
  col = "yellow",
  border = "white",
  xlab = "Debt Amount ($)",
  main = "Distribution of Student Debt"
)
# Histogram of median family income. MedIncome is recorded in thousands of
# dollars (values such as 23.6 and a mean near 46.5), so the axis label
# states that unit instead of plain dollars.
hist(college$MedIncome,
     main = "Distribution of Median Family Income",
     xlab = "Median Family Income ($1,000s)",
     col = "dark green",
     border = "white")
# Mean and standard deviation, on the same thousands-of-dollars scale.
mean(college$MedIncome, na.rm = TRUE)
## [1] 46.51453
sd(college$MedIncome, na.rm = TRUE)
## [1] 22.85785
# Boxplots of in-state and out-of-state tuition drawn next to each other.
# `TuitonOut` (sic) is the column name as it appears in the data set.
with(
  college,
  boxplot(
    TuitionIn, TuitonOut,
    col = c("white", "orange"),
    names = c("In-State", "Out-of-State"),
    ylab = "Tuition ($)",
    main = "Tuition Comparison: In-State vs Out-of-State"
  )
)
# One completion-rate boxplot per region; las = 2 rotates the axis labels so
# the region names stay readable.
boxplot(
  CompRate ~ Region,
  data = college,
  las = 2,
  xlab = "Region",
  ylab = "Completion Rate (%)",
  main = "Completion Rate by Region"
)
# Pearson correlation between annual cost and median family income, using
# only colleges that report both values.
cor(college$Cost, college$MedIncome, use = "complete.obs")
## [1] 0.589288
# Scatterplot of the same pair. MedIncome is recorded in thousands of dollars
# (values such as 23.6), so the y-axis label states that unit.
plot(college$Cost, college$MedIncome,
     main = "Cost vs Median Family Income",
     xlab = "Average Cost ($)",
     ylab = "Median Family Income ($1,000s)",
     pch = 19,
     col = "darkgreen")
This project helped me understand how to use descriptive statistics in R to explore real college data. I used measures like the mean, median, standard deviation, and correlation, along with visual tools such as histograms, boxplots, and scatterplots. From the results, I noticed that the average annual cost for four-year colleges is about $34,300 (median about $30,700), but some private schools are much higher. Private colleges tend to have higher graduation rates, while for-profit ones have the lowest. The Northeast region showed the highest average faculty salaries compared to other areas.
Most colleges have admission rates between 50% and 90%, meaning they’re not extremely selective. There was also a clear positive relationship between SAT scores and graduation rates (correlation of about 0.82). Student debt and family income were both right-skewed, showing that most colleges fall within the lower ranges. Out-of-state tuition was much higher than in-state, and median family income tended to be higher at more expensive schools (correlation of about 0.59). Overall, this project helped me practice using R and understand how data analysis can reveal interesting patterns about U.S. colleges.
# Load the college scores data set from the Lock5 site (needs network access).
college <- read.csv(
  file = "https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv"
)
# Q1. What is the average annual cost for four-year colleges?
# Mean, median and standard deviation of Cost, ignoring missing values.
mean(college[["Cost"]], na.rm = TRUE)
## [1] 34277.31
median(college[["Cost"]], na.rm = TRUE)
## [1] 30699
sd(college[["Cost"]], na.rm = TRUE)
## [1] 15278.54
# Histogram of cost across all colleges.
hist(
  x = college[["Cost"]],
  col = "blue",
  border = "white",
  xlab = "Average Annual Cost ($)",
  main = "Distribution of Annual Cost"
)
# Q2. What is the median graduation rate among private vs. public colleges?
# The table also includes the for-profit group, the third level of Control.
aggregate(CompRate ~ Control, data = college, FUN = median, na.rm = TRUE)
## Control CompRate
## 1 Private 56.240
## 2 Profit 26.390
## 3 Public 48.575
# Side-by-side boxplots of completion rate by control type.
# There are three groups, so three fill colours are supplied: with only two,
# R recycles the vector and Private and Public end up with the same fill.
# A solid black fill is avoided because it hides the (black) median bar.
boxplot(
  CompRate ~ Control,
  data = college,
  main = "Completion Rate by College Type",
  xlab = "College Type",
  ylab = "Completion Rate (%)",
  col = c("white", "gray70", "gray40")
)
# Q3. Which region has the highest average faculty salary?
# Regional means as a table ...
aggregate(FacSalary ~ Region, data = college, FUN = mean, na.rm = TRUE)
## Region FacSalary
## 1 Midwest 7113.745
## 2 Northeast 8305.674
## 3 Southeast 6820.719
## 4 Territory 4333.978
## 5 West 7837.133
# ... and as a bar chart; las = 2 turns the axis labels perpendicular to the
# axis so the region names do not overlap.
barplot(
  with(college, tapply(FacSalary, Region, mean, na.rm = TRUE)),
  las = 2,
  col = "red",
  main = "Average Faculty Salary by Region",
  xlab = "Region",
  ylab = "Average Faculty Salary ($)"
)
# Q4. What proportion of colleges have an admission rate below 50 %?
# AdmitRate is stored as a proportion between 0 and 1, so 50 % is 0.5.
# Direct answer to the question: the share of colleges (with a reported
# rate) that admit fewer than half of their applicants.
mean(college$AdmitRate < 0.5, na.rm = TRUE)
# Supporting summary statistics for the admission rate.
mean(college$AdmitRate, na.rm = TRUE)
## [1] 0.6702025
median(college$AdmitRate, na.rm = TRUE)
## [1] 0.69505
sd(college$AdmitRate, na.rm = TRUE)
## [1] 0.208179
# Histogram of admission rates. The axis label states the actual unit
# (proportion); the previous "(%)" label did not match the 0-1 scale.
hist(college$AdmitRate,
     main = "Distribution of Admission Rates",
     xlab = "Admission Rate (proportion)",
     col = "lightblue")
# Q5. What is the correlation between SAT score and graduation rate?
# Pearson correlation on the colleges that report both values.
with(college, cor(AvgSAT, CompRate, use = "complete.obs"))
## [1] 0.8189495
# Scatterplot of the same pair of variables (solid red points).
with(
  college,
  plot(
    AvgSAT, CompRate,
    pch = 19,
    col = "red",
    xlab = "Average SAT Score",
    ylab = "Completion Rate (%)",
    main = "SAT vs Completion Rate"
  )
)
# Q6. How variable are student loan amounts?
# Mean for reference, then two measures of spread: the standard deviation
# and the variance (its square).
with(college, mean(Debt, na.rm = TRUE))
## [1] 2365.655
with(college, sd(Debt, na.rm = TRUE))
## [1] 5360.986
with(college, var(Debt, na.rm = TRUE))
## [1] 28740171
# Histogram of debt across colleges.
hist(
  x = college[["Debt"]],
  col = "yellow",
  border = "white",
  xlab = "Debt Amount ($)",
  main = "Distribution of Student Debt"
)
# Q7. Create a histogram of average earnings 10 years after graduation
# NOTE: the code below plots MedIncome (median family income), not an
# earnings-after-graduation variable, so the plot is titled accordingly.
# MedIncome is recorded in thousands of dollars (values such as 23.6 and a
# mean near 46.5), so the axis label states that unit instead of plain dollars.
hist(college$MedIncome,
     main = "Distribution of Median Family Income",
     xlab = "Median Family Income ($1,000s)",
     col = "darkgreen",
     border = "white")
# Mean and standard deviation, on the same thousands-of-dollars scale.
mean(college$MedIncome, na.rm = TRUE)
## [1] 46.51453
sd(college$MedIncome, na.rm = TRUE)
## [1] 22.85785
# Q8. Compare tuition between in-state and out-of-state students
# Two boxplots drawn next to each other. `TuitonOut` (sic) is the column name
# as it appears in the data set.
with(
  college,
  boxplot(
    TuitionIn, TuitonOut,
    col = c("white", "orange"),
    names = c("In-State", "Out-of-State"),
    ylab = "Tuition ($)",
    main = "Tuition Comparison: In-State vs Out-of-State"
  )
)
# Q9. Show the distribution of graduation rates by region
# One completion-rate boxplot per region; las = 2 rotates the axis labels so
# the region names stay readable.
boxplot(
  CompRate ~ Region,
  data = college,
  las = 2,
  xlab = "Region",
  ylab = "Completion Rate (%)",
  main = "Completion Rate by Region"
)
# Q10. Is there a relationship between tuition and earnings 10 years after graduation?
# NOTE: the code below uses Cost (average annual cost) and MedIncome (median
# family income) rather than tuition and post-graduation earnings.
# Pearson correlation on the colleges that report both values.
cor(college$Cost, college$MedIncome, use = "complete.obs")
## [1] 0.589288
# Scatterplot of the same pair. MedIncome is recorded in thousands of dollars
# (values such as 23.6), so the y-axis label states that unit.
plot(college$Cost, college$MedIncome,
     main = "Cost vs Median Family Income",
     xlab = "Average Cost ($)",
     ylab = "Median Family Income ($1,000s)",
     pch = 19,
     col = "darkgreen")