Introduction

Using the data from lock5stat.com, I propose the following 10 questions for this project.

  1. What is the mean faculty salary across all colleges?
  2. What is the median completion rate of colleges by region?
  3. How does the mean debt of students vary between public and private institutions?
  4. What is the variance in net price among colleges in different states?
  5. Is there a correlation between the average SAT score and the admission rate?
  6. What percentage of students are female across all colleges, and how does this compare between public and private institutions?
  7. What is the distribution of mid-ACT scores among all colleges?
  8. How does the median income of students’ families differ by control type (Public vs. Private)?
  9. What is the standard deviation of tuition fees for in-state students across all colleges?
  10. What is the mean percentage of first-generation students in each region?

Analysis

Here, we will explore these questions in detail.

# Load the four-year CollegeScores data set from lock5stat.com into `college`,
# the data frame that every question below reads from.
college <- read.csv("https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv")
# Preview the first rows to confirm that the import worked.
head(college)
##                                  Name State     ID Main
## 1            Alabama A & M University    AL 100654    1
## 2 University of Alabama at Birmingham    AL 100663    1
## 3                  Amridge University    AL 100690    1
## 4 University of Alabama in Huntsville    AL 100706    1
## 5            Alabama State University    AL 100724    1
## 6           The University of Alabama    AL 100751    1
##                                                                Accred
## 1 Southern Association of Colleges and Schools Commission on Colleges
## 2 Southern Association of Colleges and Schools Commission on Colleges
## 3 Southern Association of Colleges and Schools Commission on Colleges
## 4 Southern Association of Colleges and Schools Commission on Colleges
## 5 Southern Association of Colleges and Schools Commission on Colleges
## 6 Southern Association of Colleges and Schools Commission on Colleges
##   MainDegree HighDegree Control    Region Locale Latitude Longitude AdmitRate
## 1          3          4  Public Southeast   City 34.78337 -86.56850    0.9027
## 2          3          4  Public Southeast   City 33.50570 -86.79935    0.9181
## 3          3          4 Private Southeast   City 32.36261 -86.17401        NA
## 4          3          4  Public Southeast   City 34.72456 -86.64045    0.8123
## 5          3          4  Public Southeast   City 32.36432 -86.29568    0.9787
## 6          3          4  Public Southeast   City 33.21187 -87.54598    0.5330
##   MidACT AvgSAT Online Enrollment White Black Hispanic Asian Other PartTime
## 1     18    929      0       4824   2.5  90.7      0.9   0.2   5.6      6.6
## 2     25   1195      0      12866  57.8  25.9      3.3   5.9   7.1     25.2
## 3     NA     NA      1        322   7.1  14.3      0.6   0.3  77.6     54.4
## 4     28   1322      0       6917  74.2  10.7      4.6   4.0   6.5     15.0
## 5     18    935      0       4189   1.5  93.8      1.0   0.3   3.5      7.7
## 6     28   1278      0      32387  78.5  10.1      4.7   1.2   5.6      7.9
##   NetPrice  Cost TuitionIn TuitonOut TuitionFTE InstructFTE FacSalary
## 1    15184 22886      9857     18236       9227        7298      6983
## 2    17535 24129      8328     19032      11612       17235     10640
## 3     9649 15080      6900      6900      14738        5265      3866
## 4    19986 22108     10280     21480       8727        9748      9391
## 5    12874 19413     11068     19396       9003        7983      7399
## 6    21973 28836     10780     28100      13574       10894     10016
##   FullTimeFac Pell CompRate Debt Female FirstGen MedIncome
## 1        71.3 71.0    23.96 1068   56.4     36.6      23.6
## 2        89.9 35.3    52.92 3755   63.9     34.1      34.5
## 3       100.0 74.2    18.18  109   64.9     51.3      15.0
## 4        64.6 27.7    48.62 1347   47.6     31.0      44.8
## 5        54.2 73.8    27.69 1294   61.3     34.3      22.1
## 6        74.0 18.0    67.87 6430   61.5     22.6      66.7
# Q1) Mean faculty salary across all colleges; na.rm = TRUE skips colleges
# whose salary is missing.
mean(college$FacSalary, na.rm = TRUE)
## [1] 7465.778
# hist() has no `na.rm` argument: it was forwarded to the plotting routines,
# which each warned that "na.rm" is not a graphical parameter. hist() already
# leaves missing values out of the bins, so the argument is simply dropped.
hist(college$FacSalary, main = "Faculty Salary", xlab = "Salary")

Q2) What is the median completion rate of colleges by region?

# Overall median completion rate, kept for reference. median() works on a
# single data vector, so the Region column that used to be passed as a second
# argument was ignored and this overall figure was all the call ever returned.
median(college$CompRate, na.rm = TRUE)
## [1] 52.45
# Median completion rate within each region: tapply() splits CompRate by
# Region and applies median() to every group, skipping missing rates.
tapply(college$CompRate, college$Region, median, na.rm = TRUE)
# boxplot() drops missing values itself, so it takes no na.rm argument.
boxplot(college$CompRate ~ college$Region, xlab = "Region", ylab = "Completion Rate")

Q3) How does the mean debt of students vary between public and private institutions?

# Average debt within each control type, skipping colleges with no debt value.
mean_debt <- with(college, tapply(Debt, Control, mean, na.rm = TRUE))
print(mean_debt)
##   Private    Profit    Public 
##  724.6099 9451.8917 3585.4286
# Reshape the named result into a two-column data frame that ggplot can read:
# the group names become Control and the averages become MeanDebt.
mean_debt_df <- data.frame(
  Control = names(mean_debt),
  MeanDebt = as.vector(mean_debt)
)

# ggplot2 supplies the plotting functions used below.
library(ggplot2)

# One bar per control type; geom_col() takes the bar heights directly from
# MeanDebt.
ggplot(mean_debt_df, aes(x = Control, y = MeanDebt, fill = Control)) +
  geom_col() +
  labs(
    title = "Mean Student Debt by Institution Type",
    x = "Institution Type (Public/Private)",
    y = "Mean Debt"
  ) +
  theme_minimal()

Q4) What is the variance in net price among colleges in different states?

# Load necessary libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Coerce the two analysis columns in place: net price as a number and state as
# a factor. Later sections see `college` with these converted columns.
college$NetPrice <- as.numeric(college$NetPrice)
college$State <- as.factor(college$State)

# Keep only the colleges that report both a net price and a state.
college_clean <- filter(college, !is.na(NetPrice) & !is.na(State))

# One row per state holding the sample variance of its colleges' net prices.
variance_net_price <- summarise(
  group_by(college_clean, State),
  Variance = var(NetPrice, na.rm = TRUE)
)

# Show the per-state variances.
print(variance_net_price)
## # A tibble: 54 × 2
##    State   Variance
##    <fct>      <dbl>
##  1 AK     11981682.
##  2 AL     29085554.
##  3 AR     12062511.
##  4 AZ     50634889.
##  5 CA    107756295.
##  6 CO     34961444.
##  7 CT     96643527.
##  8 DC     53448945.
##  9 DE     19749702.
## 10 FL     69457181.
## # ℹ 44 more rows
library(ggplot2)

# Horizontal bar chart of the per-state variance, ordered from smallest to
# largest. var() is NA for a state with fewer than two usable colleges; those
# rows used to reach geom_bar(), which dropped them with a "Removed rows
# containing missing values" warning. Filtering them out first keeps only the
# states that have a bar to draw.
ggplot(
  dplyr::filter(variance_net_price, !is.na(Variance)),
  aes(x = reorder(State, Variance), y = Variance)
) +
  geom_bar(stat = "identity", fill = "skyblue") +
  coord_flip() +
  labs(title = "Variance in Net Price by State",
       x = "State",
       y = "Variance in Net Price") +
  theme_minimal()

Q5) Is there a correlation between the average SAT score and the admission rate?

# Keep the colleges that report both an average SAT score and an admission
# rate, so that cor() sees no missing values.
college_clean <- filter(college, !is.na(AvgSAT) & !is.na(AdmitRate))

# Correlation between the two columns, using cor()'s default method.
correlation <- with(college_clean, cor(AvgSAT, AdmitRate))

# Show the coefficient.
print(correlation)
## [1] -0.4221255
library(ggplot2)

# Scatter plot of the retained colleges with a fitted straight line
# (method = "lm") and no confidence band (se = FALSE).
ggplot(college_clean, aes(x = AvgSAT, y = AdmitRate)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, colour = "blue") +
  labs(
    title = "Correlation between Average SAT Score and Admission Rate",
    x = "Average SAT Score",
    y = "Admission Rate"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Q6) What percentage of students are female across all colleges, and how does this compare between public and private institutions?

# dplyr provides filter(), group_by() and summarise().
library(dplyr)

# Make sure the Female column is numeric before averaging it.
college$Female <- as.numeric(college$Female)

# Keep the colleges that report both a female percentage and a control type.
college_clean <- filter(college, !is.na(Female) & !is.na(Control))

# Mean of the per-college female percentages over the retained colleges.
overall_female_mean <- mean(college_clean$Female, na.rm = TRUE)
cat("Overall mean percentage of female students across all colleges:", round(overall_female_mean, 2), "%\n")
## Overall mean percentage of female students across all colleges: 59.3 %
# The same mean, computed separately for each control type.
female_mean_by_type <- summarise(
  group_by(college_clean, Control),
  MeanFemalePercentage = mean(Female, na.rm = TRUE)
)

# Show the per-type means.
print(female_mean_by_type)
## # A tibble: 3 × 2
##   Control MeanFemalePercentage
##   <chr>                  <dbl>
## 1 Private                 58.6
## 2 Profit                  68.7
## 3 Public                  58.1
# ggplot2 supplies the plotting functions used below.
library(ggplot2)

# Pie chart: a single stacked bar (x = "") wrapped around the y axis by
# coord_polar(), with axis titles, text and ticks blanked out.
ggplot(female_mean_by_type, aes(x = "", y = MeanFemalePercentage, fill = Control)) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  labs(
    title = "Pie Chart of Mean Percentage of Female Students by Institution Type",
    fill = "Institution Type (Public/Private)"
  ) +
  theme_minimal() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank()
  )

Q7) What is the distribution of mid-ACT scores among all colleges?

# Load necessary libraries
library(ggplot2)
library(dplyr)

# Check the structure of the data: str() lists every column of `college` with
# its type and first few values. State appears as a factor and NetPrice as
# numeric here because the Q4 section converted both columns in place.
str(college)
## 'data.frame':    2012 obs. of  37 variables:
##  $ Name       : chr  "Alabama A & M University" "University of Alabama at Birmingham" "Amridge University" "University of Alabama in Huntsville" ...
##  $ State      : Factor w/ 54 levels "AK","AL","AR",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ ID         : int  100654 100663 100690 100706 100724 100751 100812 100830 100858 100937 ...
##  $ Main       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Accred     : chr  "Southern Association of Colleges and Schools Commission on Colleges" "Southern Association of Colleges and Schools Commission on Colleges" "Southern Association of Colleges and Schools Commission on Colleges" "Southern Association of Colleges and Schools Commission on Colleges" ...
##  $ MainDegree : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ HighDegree : int  4 4 4 4 4 4 3 4 4 3 ...
##  $ Control    : chr  "Public" "Public" "Private" "Public" ...
##  $ Region     : chr  "Southeast" "Southeast" "Southeast" "Southeast" ...
##  $ Locale     : chr  "City" "City" "City" "City" ...
##  $ Latitude   : num  34.8 33.5 32.4 34.7 32.4 ...
##  $ Longitude  : num  -86.6 -86.8 -86.2 -86.6 -86.3 ...
##  $ AdmitRate  : num  0.903 0.918 NA 0.812 0.979 ...
##  $ MidACT     : int  18 25 NA 28 18 28 NA 22 27 26 ...
##  $ AvgSAT     : int  929 1195 NA 1322 935 1278 NA 1083 1282 1231 ...
##  $ Online     : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ Enrollment : int  4824 12866 322 6917 4189 32387 2801 4211 23391 1283 ...
##  $ White      : num  2.5 57.8 7.1 74.2 1.5 78.5 77.4 49.6 81.8 79.3 ...
##  $ Black      : num  90.7 25.9 14.3 10.7 93.8 10.1 12.6 38.3 6.3 11.9 ...
##  $ Hispanic   : num  0.9 3.3 0.6 4.6 1 4.7 2.7 1.3 3.4 1.8 ...
##  $ Asian      : num  0.2 5.9 0.3 4 0.3 1.2 0.9 2.6 2.4 5.5 ...
##  $ Other      : num  5.6 7.1 77.6 6.5 3.5 5.6 6.5 8.2 6.2 1.6 ...
##  $ PartTime   : num  6.6 25.2 54.4 15 7.7 7.9 56.9 23.2 8.4 0.9 ...
##  $ NetPrice   : num  15184 17535 9649 19986 12874 ...
##  $ Cost       : int  22886 24129 15080 22108 19413 28836 NA 19892 30458 50440 ...
##  $ TuitionIn  : int  9857 8328 6900 10280 11068 10780 NA 8020 10968 35804 ...
##  $ TuitonOut  : int  18236 19032 6900 21480 19396 28100 NA 17140 29640 35804 ...
##  $ TuitionFTE : int  9227 11612 14738 8727 9003 13574 6713 8709 15479 10088 ...
##  $ InstructFTE: int  7298 17235 5265 9748 7983 10894 8017 7487 12067 10267 ...
##  $ FacSalary  : int  6983 10640 3866 9391 7399 10016 8268 7518 10137 7774 ...
##  $ FullTimeFac: num  71.3 89.9 100 64.6 54.2 74 42.7 97.4 85.5 66.2 ...
##  $ Pell       : num  71 35.3 74.2 27.7 73.8 18 44.6 44.2 14.9 19.2 ...
##  $ CompRate   : num  24 52.9 18.2 48.6 27.7 ...
##  $ Debt       : int  1068 3755 109 1347 1294 6430 913 959 4152 268 ...
##  $ Female     : num  56.4 63.9 64.9 47.6 61.3 61.5 70.5 69.3 53.2 52 ...
##  $ FirstGen   : num  36.6 34.1 51.3 31 34.3 22.6 47.7 38.2 17.3 17.2 ...
##  $ MedIncome  : num  23.6 34.5 15 44.8 22.1 66.7 29.6 29.7 72 68.1 ...
# Coerce MidACT to numeric before summarising and plotting it.
college$MidACT <- as.numeric(college$MidACT)

# Drop the colleges that have no mid-ACT score.
college_clean <- filter(college, !is.na(MidACT))

# Minimum, quartiles, mean and maximum of the remaining scores.
summary(college_clean$MidACT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.00   21.00   23.00   23.54   25.00   35.00
# Histogram of the scores, split into 30 bins.
ggplot(college_clean, aes(x = MidACT)) +
  geom_histogram(bins = 30, colour = "black", fill = "skyblue") +
  labs(
    title = "Distribution of Mid-ACT Scores Among All Colleges",
    x = "Mid-ACT Score",
    y = "Frequency"
  ) +
  theme_minimal()

Q8) How does the median income of students’ families differ by control type (Public vs. Private)?

# Overall median of MedIncome, kept for reference. median() works on a single
# data vector, so the Control column that used to be passed as a second
# argument was ignored and this overall figure was all the call ever returned.
median(college$MedIncome, na.rm = TRUE)
## [1] 42.6
# Median of MedIncome within each control type: tapply() splits the column by
# Control and applies median() to every group, skipping missing values.
tapply(college$MedIncome, college$Control, median, na.rm = TRUE)
# boxplot() drops missing values itself, so it takes no na.rm argument.
boxplot(college$MedIncome ~ college$Control, xlab = "Institution Type", ylab = "Median Income")

Q9) What is the standard deviation of tuition fees for in-state students across all colleges?

# Standard deviation of in-state tuition across all colleges; na.rm = TRUE
# skips colleges with no in-state tuition on record.
std_dev_tuition <- sd(college$TuitionIn, na.rm = TRUE)

# Display the result
print(std_dev_tuition)
## [1] 14130.3
# Load the ggplot2 library
library(ggplot2)

# Print the standard deviation with a label, reusing the value computed above
# rather than calculating it a second time.
print(paste("Standard Deviation of In-State Tuition Fees:", round(std_dev_tuition, 2)))
## [1] "Standard Deviation of In-State Tuition Fees: 14130.3"
# Histogram of in-state tuition in $1,000-wide bins.
# - na.rm = TRUE drops colleges with missing tuition silently instead of with
#   a "Removed rows containing non-finite" warning.
# - xintercept is set as a constant rather than mapped inside aes(), so the
#   reference line is drawn once instead of once per row of `college`.
# - linewidth replaces size, which is deprecated for lines since ggplot2 3.4.0.
# NOTE(review): the dashed line sits at x = SD, which measures spread rather
# than a position on the tuition axis; consider marking mean +/- SD instead.
ggplot(college, aes(x = TuitionIn)) +
  geom_histogram(
    binwidth = 1000, fill = "skyblue", color = "black", alpha = 0.7,
    na.rm = TRUE
  ) +
  geom_vline(
    xintercept = std_dev_tuition, color = "red", linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Histogram of In-State Tuition Fees",
    x = "In-State Tuition Fees ($)",
    y = "Frequency"
  ) +
  theme_minimal() +
  annotate(
    "text",
    x = std_dev_tuition, y = 10,
    label = paste("SD =", round(std_dev_tuition, 2)),
    color = "red", angle = 90, vjust = -0.5
  )

Q10) What is the mean percentage of first-generation students in each region?

# Overall mean percentage of first-generation students, kept for reference.
mean(college$FirstGen, na.rm = TRUE)
## [1] 33.55713
# Mean percentage within each region, which is what the question asks for:
# tapply() splits FirstGen by Region and averages every group, skipping
# missing values.
tapply(college$FirstGen, college$Region, mean, na.rm = TRUE)
# Histogram of FirstGen over all colleges. hist() has no `na.rm` argument (it
# only triggered "not a graphical parameter" warnings), so it is dropped. The
# title no longer claims a by-region breakdown that this plot does not show,
# and the "Genteration" typo in the axis label is fixed.
hist(
  college$FirstGen,
  main = "First Generation Students (All Colleges)",
  xlab = "First Generation Students"
)