This report analyzes the CollegeScores4yr dataset using descriptive statistics methods from Chapter 6, including measures of center, spread, and visualizations.
I propose the following 10 questions based on my own understanding of the data.
We will explore the questions in detail:
# Load the CollegeScores4yr dataset directly from the Lock5 website
# (requires network access).
college <- read.csv("https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv")

# Numeric summaries; na.rm = TRUE drops rows where the value is missing
# instead of letting a single NA turn the whole result into NA.
mean_cost <- mean(college$Cost, na.rm = TRUE)            # mean of Cost
admit_median <- median(college$AdmitRate, na.rm = TRUE)  # median of AdmitRate
sat_sd <- sd(college$AvgSAT, na.rm = TRUE)               # std. dev. of AvgSAT
# Histogram of the Cost column; breaks = 20 requests about 20 bins.
hist(
  x = college$Cost,
  breaks = 20,
  main = "College Costs Distribution",
  xlab = "Cost ($)",
  col = "skyblue"
)
# Pie chart of the number of schools in each Control category.
type_counts <- table(college$Control)
pie(
  type_counts,
  # Each slice is labelled "<category>\n<share>%", share rounded to 1 decimal.
  labels = paste0(
    names(type_counts), "\n",
    round(prop.table(type_counts) * 100, 1), "%"
  ),
  # Supply one fill colour per slice. With only two colours pie() recycles
  # them, so a third category would repeat the first slice's colour.
  # NOTE(review): Control appears to have three categories in this dataset;
  # confirm against the data documentation.
  col = rep_len(
    c("lightblue", "lightgreen", "lightsalmon"),
    length(type_counts)
  ),
  # sprintf() avoids the stray space that paste() put before the closing ")".
  main = sprintf("Institution Control (n = %d)", sum(type_counts))
)
# Correlation between Cost and AvgSAT (cor() defaults to Pearson).
# use = "complete.obs" restricts the calculation to rows where both
# values are present.
cor_cost_sat <- cor(college$Cost, college$AvgSAT, use = "complete.obs")
# Backward-compatible alias: the result was originally stored under this
# misspelled name, so keep it in case other code still refers to it.
ccor_cost_sat <- cor_cost_sat
# Side-by-side boxplots of CompRate, one box per Control category.
boxplot(
  CompRate ~ Control,
  data = college,
  # One fill colour per group. With only two colours boxplot() recycles them,
  # so a third group would repeat the first group's colour.
  # NOTE(review): Control appears to have three categories in this dataset;
  # confirm against the data documentation.
  col = rep_len(
    c("lightblue", "lightgreen", "lightsalmon"),
    length(unique(na.omit(college$Control)))
  ),
  main = "Completion Rates by Institution Control"
)
# Histogram of the FacSalary column, using hist()'s default binning.
hist(
  x = college$FacSalary,
  main = "Faculty Salaries",
  xlab = "Salary ($)",
  col = "gold"
)
# Median of the row-wise ratio Enrollment / InstructFTE; rows where the ratio
# is NA (either value missing) are dropped before taking the median.
# NOTE(review): the name suggests a student-faculty ratio, but it is not clear
# that InstructFTE is a faculty count -- confirm the column's meaning in the
# dataset documentation before interpreting this number.
sf_ratio <- with(
  college,
  median(Enrollment / InstructFTE, na.rm = TRUE)
)
# Scatterplot of CompRate (y) against AdmitRate (x).
# Points are solid circles (pch = 19) in a half-transparent purple so that
# overlapping points remain visible.
plot(
  x = college$AdmitRate,
  y = college$CompRate,
  xlab = "Admission Rate",
  ylab = "Completion Rate",
  pch = 19,
  col = rgb(red = 0.5, green = 0, blue = 0.8, alpha = 0.5)
)
# Appendix: the complete analysis in one place.
# Lines starting with "##" are the printed results from the rendered report.
college <- read.csv(
  file = "https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv"
)

# Code for each question, in order.
# Mean of Cost, ignoring missing values.
mean(x = college$Cost, na.rm = TRUE)
## [1] 34277.31
# Median of AdmitRate, ignoring missing values.
median(x = college$AdmitRate, na.rm = TRUE)
## [1] 0.69505
# Standard deviation of AvgSAT, ignoring missing values.
sd(x = college$AvgSAT, na.rm = TRUE)
## [1] 128.9077
# Histogram of Cost with default settings.
hist(x = college$Cost)
# Pie chart of the Control category counts.
pie(x = table(college$Control))
# Correlation of Cost and AvgSAT over rows where both are present.
cor(x = college$Cost, y = college$AvgSAT, use = "complete.obs")
## [1] 0.5373884
# Boxplots of CompRate split by Control.
boxplot(formula = CompRate ~ Control, data = college)
# Histogram of FacSalary with default settings.
hist(x = college$FacSalary)
# Median of the row-wise Enrollment / InstructFTE ratio.
median(x = college$Enrollment / college$InstructFTE, na.rm = TRUE)
## [1] 0.1979082
# Scatterplot of CompRate against AdmitRate.
plot(x = college$AdmitRate, y = college$CompRate)