Using the data from lock5stat.com, I propose the following 10 questions for this project.
Here, we will explore these questions in detail.
# Read the four-year college scores data set directly from the Lock5 site
college <- read.csv(
  file = "https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv"
)
# Show the first rows as a quick check that the import worked
head(college)
## Name State ID Main
## 1 Alabama A & M University AL 100654 1
## 2 University of Alabama at Birmingham AL 100663 1
## 3 Amridge University AL 100690 1
## 4 University of Alabama in Huntsville AL 100706 1
## 5 Alabama State University AL 100724 1
## 6 The University of Alabama AL 100751 1
## Accred
## 1 Southern Association of Colleges and Schools Commission on Colleges
## 2 Southern Association of Colleges and Schools Commission on Colleges
## 3 Southern Association of Colleges and Schools Commission on Colleges
## 4 Southern Association of Colleges and Schools Commission on Colleges
## 5 Southern Association of Colleges and Schools Commission on Colleges
## 6 Southern Association of Colleges and Schools Commission on Colleges
## MainDegree HighDegree Control Region Locale Latitude Longitude AdmitRate
## 1 3 4 Public Southeast City 34.78337 -86.56850 0.9027
## 2 3 4 Public Southeast City 33.50570 -86.79935 0.9181
## 3 3 4 Private Southeast City 32.36261 -86.17401 NA
## 4 3 4 Public Southeast City 34.72456 -86.64045 0.8123
## 5 3 4 Public Southeast City 32.36432 -86.29568 0.9787
## 6 3 4 Public Southeast City 33.21187 -87.54598 0.5330
## MidACT AvgSAT Online Enrollment White Black Hispanic Asian Other PartTime
## 1 18 929 0 4824 2.5 90.7 0.9 0.2 5.6 6.6
## 2 25 1195 0 12866 57.8 25.9 3.3 5.9 7.1 25.2
## 3 NA NA 1 322 7.1 14.3 0.6 0.3 77.6 54.4
## 4 28 1322 0 6917 74.2 10.7 4.6 4.0 6.5 15.0
## 5 18 935 0 4189 1.5 93.8 1.0 0.3 3.5 7.7
## 6 28 1278 0 32387 78.5 10.1 4.7 1.2 5.6 7.9
## NetPrice Cost TuitionIn TuitonOut TuitionFTE InstructFTE FacSalary
## 1 15184 22886 9857 18236 9227 7298 6983
## 2 17535 24129 8328 19032 11612 17235 10640
## 3 9649 15080 6900 6900 14738 5265 3866
## 4 19986 22108 10280 21480 8727 9748 9391
## 5 12874 19413 11068 19396 9003 7983 7399
## 6 21973 28836 10780 28100 13574 10894 10016
## FullTimeFac Pell CompRate Debt Female FirstGen MedIncome
## 1 71.3 71.0 23.96 1068 56.4 36.6 23.6
## 2 89.9 35.3 52.92 3755 63.9 34.1 34.5
## 3 100.0 74.2 18.18 109 64.9 51.3 15.0
## 4 64.6 27.7 48.62 1347 47.6 31.0 44.8
## 5 54.2 73.8 27.69 1294 61.3 34.3 22.1
## 6 74.0 18.0 67.87 6430 61.5 22.6 66.7
# Average faculty salary across all colleges; missing salaries are excluded
# so that a single NA does not turn the whole mean into NA.
mean(college$FacSalary, na.rm = TRUE)
## [1] 7465.778
# Histogram of faculty salaries. hist() drops non-finite values on its own
# and has no `na.rm` argument: passing one only leaks into the graphical
# parameters and triggers '"na.rm" is not a graphical parameter' warnings,
# so it is omitted here.
hist(college$FacSalary, main = "Faculty Salary", xlab = "Salary")
# Overall median completion rate. median() accepts a single data vector; the
# grouping column that used to be passed as a second positional argument was
# silently absorbed by `...`, so the value below was never a per-region figure.
median(college$CompRate, na.rm = TRUE)
## [1] 52.45
# Median completion rate within each region (the actual by-region summary)
tapply(college$CompRate, college$Region, median, na.rm = TRUE)
# Side-by-side boxplots of completion rate per region. boxplot() discards
# missing values itself and has no `na.rm` argument, so none is passed.
boxplot(CompRate ~ Region, data = college,
        xlab = "Region", ylab = "Completion Rate")
# Mean student debt within each institution control type; NA debts are skipped
mean_debt <- tapply(college$Debt, college$Control, mean, na.rm = TRUE)
print(mean_debt)
## Private Profit Public
## 724.6099 9451.8917 3585.4286
# Reshape the named result into a two-column data frame so ggplot can use it
mean_debt_df <- data.frame(
  Control = names(mean_debt),
  MeanDebt = as.numeric(mean_debt)
)
# Load ggplot2 library
library(ggplot2)
# Bar plot with one bar per control type. The x-axis label names all three
# categories present in the data (Private, Profit, Public); the previous
# label mentioned only Public/Private.
ggplot(mean_debt_df, aes(x = Control, y = MeanDebt, fill = Control)) +
  geom_bar(stat = "identity") +
  labs(title = "Mean Student Debt by Institution Type",
       x = "Institution Type (Private/Profit/Public)",
       y = "Mean Debt") +
  theme_minimal()
# Load necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Store net price as a double and state as a factor on the main data frame
college[["NetPrice"]] <- as.numeric(college[["NetPrice"]])
college[["State"]] <- as.factor(college[["State"]])
# Keep only colleges where both net price and state are known
college_clean <- filter(college, !is.na(NetPrice) & !is.na(State))
# One row per state holding the sample variance of net price in that state
variance_net_price <- summarise(
  group_by(college_clean, State),
  Variance = var(NetPrice, na.rm = TRUE)
)
# Show the per-state variances
print(variance_net_price)
## # A tibble: 54 × 2
## State Variance
## <fct> <dbl>
## 1 AK 11981682.
## 2 AL 29085554.
## 3 AR 12062511.
## 4 AZ 50634889.
## 5 CA 107756295.
## 6 CO 34961444.
## 7 CT 96643527.
## 8 DC 53448945.
## 9 DE 19749702.
## 10 FL 69457181.
## # ℹ 44 more rows
library(ggplot2)
# Some State groups have an NA variance (var() of a single observation is
# NA). Those rows cannot be drawn, so drop them explicitly instead of letting
# ggplot2 discard them with a "Removed ... rows" warning.
variance_plot_data <- variance_net_price[!is.na(variance_net_price$Variance), ]
# Horizontal bar plot of net-price variance, states ordered by variance
ggplot(variance_plot_data, aes(x = reorder(State, Variance), y = Variance)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  coord_flip() +
  labs(title = "Variance in Net Price by State",
       x = "State",
       y = "Variance in Net Price") +
  theme_minimal()
# Keep only colleges that report both an average SAT score and an admit rate
college_clean <- filter(college, !is.na(AvgSAT) & !is.na(AdmitRate))
# Correlation coefficient between average SAT score and admission rate
correlation <- with(college_clean, cor(AvgSAT, AdmitRate))
# Show the coefficient
print(correlation)
## [1] -0.4221255
library(ggplot2)
# Admission rate plotted against average SAT score, with a straight-line
# least-squares fit drawn on top (no confidence band)
ggplot(data = college_clean, mapping = aes(x = AvgSAT, y = AdmitRate)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, colour = "blue") +
  ggtitle("Correlation between Average SAT Score and Admission Rate") +
  xlab("Average SAT Score") +
  ylab("Admission Rate") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Load necessary libraries
library(dplyr)
# Make sure the female-percentage column is stored as numeric
college[["Female"]] <- as.numeric(college[["Female"]])
# Keep colleges that have both a female percentage and a control type
college_clean <- filter(college, !is.na(Female) & !is.na(Control))
# Mean female percentage over every remaining college
overall_female_mean <- mean(college_clean[["Female"]], na.rm = TRUE)
cat("Overall mean percentage of female students across all colleges:", round(overall_female_mean, 2), "%\n")
## Overall mean percentage of female students across all colleges: 59.3 %
# Mean female percentage within each control type
female_mean_by_type <- summarise(
  group_by(college_clean, Control),
  MeanFemalePercentage = mean(Female, na.rm = TRUE)
)
# Show the per-type means
print(female_mean_by_type)
## # A tibble: 3 × 2
## Control MeanFemalePercentage
## <chr> <dbl>
## 1 Private 58.6
## 2 Profit 68.7
## 3 Public 58.1
# Load the ggplot2 library
library(ggplot2)
# Pie chart comparing the mean percentage of female students by institution
# type: a single stacked bar bent into a circle with coord_polar().
# The legend title names all three control types present in the data
# (Private, Profit, Public); the previous title mentioned only Public/Private.
# NOTE(review): the slices are group means rather than parts of one whole, so
# slice size only conveys relative magnitude; a bar chart may read better.
ggplot(female_mean_by_type, aes(x = "", y = MeanFemalePercentage, fill = Control)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  labs(
    title = "Pie Chart of Mean Percentage of Female Students by Institution Type",
    fill = "Institution Type (Private/Profit/Public)"
  ) +
  theme_minimal() +
  # Axis titles, labels, and ticks carry no meaning on a pie chart
  theme(axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text = element_blank(), axis.ticks = element_blank())
# Load necessary libraries
library(ggplot2)
library(dplyr)
# Check the structure of the data: str() lists every column of `college`
# with its storage type and the first few values
str(college)
## 'data.frame': 2012 obs. of 37 variables:
## $ Name : chr "Alabama A & M University" "University of Alabama at Birmingham" "Amridge University" "University of Alabama in Huntsville" ...
## $ State : Factor w/ 54 levels "AK","AL","AR",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ ID : int 100654 100663 100690 100706 100724 100751 100812 100830 100858 100937 ...
## $ Main : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Accred : chr "Southern Association of Colleges and Schools Commission on Colleges" "Southern Association of Colleges and Schools Commission on Colleges" "Southern Association of Colleges and Schools Commission on Colleges" "Southern Association of Colleges and Schools Commission on Colleges" ...
## $ MainDegree : int 3 3 3 3 3 3 3 3 3 3 ...
## $ HighDegree : int 4 4 4 4 4 4 3 4 4 3 ...
## $ Control : chr "Public" "Public" "Private" "Public" ...
## $ Region : chr "Southeast" "Southeast" "Southeast" "Southeast" ...
## $ Locale : chr "City" "City" "City" "City" ...
## $ Latitude : num 34.8 33.5 32.4 34.7 32.4 ...
## $ Longitude : num -86.6 -86.8 -86.2 -86.6 -86.3 ...
## $ AdmitRate : num 0.903 0.918 NA 0.812 0.979 ...
## $ MidACT : int 18 25 NA 28 18 28 NA 22 27 26 ...
## $ AvgSAT : int 929 1195 NA 1322 935 1278 NA 1083 1282 1231 ...
## $ Online : int 0 0 1 0 0 0 0 0 0 0 ...
## $ Enrollment : int 4824 12866 322 6917 4189 32387 2801 4211 23391 1283 ...
## $ White : num 2.5 57.8 7.1 74.2 1.5 78.5 77.4 49.6 81.8 79.3 ...
## $ Black : num 90.7 25.9 14.3 10.7 93.8 10.1 12.6 38.3 6.3 11.9 ...
## $ Hispanic : num 0.9 3.3 0.6 4.6 1 4.7 2.7 1.3 3.4 1.8 ...
## $ Asian : num 0.2 5.9 0.3 4 0.3 1.2 0.9 2.6 2.4 5.5 ...
## $ Other : num 5.6 7.1 77.6 6.5 3.5 5.6 6.5 8.2 6.2 1.6 ...
## $ PartTime : num 6.6 25.2 54.4 15 7.7 7.9 56.9 23.2 8.4 0.9 ...
## $ NetPrice : num 15184 17535 9649 19986 12874 ...
## $ Cost : int 22886 24129 15080 22108 19413 28836 NA 19892 30458 50440 ...
## $ TuitionIn : int 9857 8328 6900 10280 11068 10780 NA 8020 10968 35804 ...
## $ TuitonOut : int 18236 19032 6900 21480 19396 28100 NA 17140 29640 35804 ...
## $ TuitionFTE : int 9227 11612 14738 8727 9003 13574 6713 8709 15479 10088 ...
## $ InstructFTE: int 7298 17235 5265 9748 7983 10894 8017 7487 12067 10267 ...
## $ FacSalary : int 6983 10640 3866 9391 7399 10016 8268 7518 10137 7774 ...
## $ FullTimeFac: num 71.3 89.9 100 64.6 54.2 74 42.7 97.4 85.5 66.2 ...
## $ Pell : num 71 35.3 74.2 27.7 73.8 18 44.6 44.2 14.9 19.2 ...
## $ CompRate : num 24 52.9 18.2 48.6 27.7 ...
## $ Debt : int 1068 3755 109 1347 1294 6430 913 959 4152 268 ...
## $ Female : num 56.4 63.9 64.9 47.6 61.3 61.5 70.5 69.3 53.2 52 ...
## $ FirstGen : num 36.6 34.1 51.3 31 34.3 22.6 47.7 38.2 17.3 17.2 ...
## $ MedIncome : num 23.6 34.5 15 44.8 22.1 66.7 29.6 29.7 72 68.1 ...
# Store the mid-range ACT score as a double on the main data frame
college[["MidACT"]] <- as.numeric(college[["MidACT"]])
# Keep only colleges that report a mid-range ACT score
college_clean <- filter(college, !is.na(MidACT))
# Five-number summary plus mean of the reported scores
summary(college_clean[["MidACT"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.00 21.00 23.00 23.54 25.00 35.00
# Histogram (30 bins) of mid-range ACT scores across the filtered colleges
ggplot(data = college_clean, mapping = aes(x = MidACT)) +
  geom_histogram(bins = 30, colour = "black", fill = "skyblue") +
  ggtitle("Distribution of Mid-ACT Scores Among All Colleges") +
  xlab("Mid-ACT Score") +
  ylab("Frequency") +
  theme_minimal()
# Overall median of the MedIncome column. median() accepts a single data
# vector; the grouping column that used to be passed as a second positional
# argument was silently absorbed by `...`, so the value below was never a
# per-institution-type figure.
median(college$MedIncome, na.rm = TRUE)
## [1] 42.6
# Median of MedIncome within each institution control type
tapply(college$MedIncome, college$Control, median, na.rm = TRUE)
# Side-by-side boxplots per control type. boxplot() discards missing values
# itself and has no `na.rm` argument, so none is passed.
boxplot(MedIncome ~ Control, data = college,
        xlab = "Institution Type", ylab = "Median Income")
# Standard deviation of in-state tuition; colleges with no value are skipped.
# Computed once and reused by the printout and the plot below.
std_dev_tuition <- sd(college$TuitionIn, na.rm = TRUE)
# Display the result
print(std_dev_tuition)
## [1] 14130.3
# Load the ggplot2 library
library(ggplot2)
# Print the standard deviation with a descriptive label
print(paste("Standard Deviation of In-State Tuition Fees:", round(std_dev_tuition, 2)))
## [1] "Standard Deviation of In-State Tuition Fees: 14130.3"
# Drop colleges without an in-state tuition value before plotting so the
# histogram does not rely on ggplot2 discarding them with a warning.
tuition_known <- college[!is.na(college$TuitionIn), ]
# Histogram of in-state tuition in $1000-wide bins, with a dashed red marker.
# NOTE(review): the marker sits at x = SD. A standard deviation measures
# spread, not a position on the tuition axis; consider marking the mean and
# mean +/- SD instead.
ggplot(tuition_known, aes(x = TuitionIn)) +
  geom_histogram(binwidth = 1000, fill = "skyblue", color = "black", alpha = 0.7) +
  # The intercept is a constant, so it is set directly rather than mapped in
  # aes(); `linewidth` replaces `size` for lines, deprecated in ggplot2 3.4.0.
  geom_vline(xintercept = std_dev_tuition, color = "red", linetype = "dashed", linewidth = 1) +
  labs(
    title = "Histogram of In-State Tuition Fees",
    x = "In-State Tuition Fees ($)",
    y = "Frequency"
  ) +
  theme_minimal() +
  annotate("text", x = std_dev_tuition, y = 10, label = paste("SD =", round(std_dev_tuition, 2)), color = "red", angle = 90, vjust = -0.5)
# Mean of the FirstGen column across all colleges; missing values are excluded
mean(college$FirstGen, na.rm = TRUE)
## [1] 33.55713
# Histogram of the FirstGen column over all colleges. hist() drops non-finite
# values on its own and has no `na.rm` argument (passing it only produces
# '"na.rm" is not a graphical parameter' warnings), so it is omitted.
# The title no longer says "by Region" because no regional breakdown is
# plotted, and the axis-label typo "Genteration" is corrected.
hist(college$FirstGen, main = "First Generation Students",
     xlab = "First Generation Students")