This report analyzes the “CollegeScores4yr.csv” dataset using the descriptive statistics methods from Chapter 6. It addresses the following 10 questions about the dataset:
# Load library
library(ggplot2)
# Read dataset
# Fetch the CollegeScores4yr CSV from the Lock5 site (first row is the header,
# text columns stay character) and list the column names that were read.
college <- read.csv(
  file = "https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
print("Column names:")
## [1] "Column names:"
print(colnames(college))
## [1] "Name" "State" "ID" "Main" "Accred"
## [6] "MainDegree" "HighDegree" "Control" "Region" "Locale"
## [11] "Latitude" "Longitude" "AdmitRate" "MidACT" "AvgSAT"
## [16] "Online" "Enrollment" "White" "Black" "Hispanic"
## [21] "Asian" "Other" "PartTime" "NetPrice" "Cost"
## [26] "TuitionIn" "TuitonOut" "TuitionFTE" "InstructFTE" "FacSalary"
## [31] "FullTimeFac" "Pell" "CompRate" "Debt" "Female"
## [36] "FirstGen" "MedIncome"
# Mean of AdmitRate with missing values dropped; reports a missing column
# instead of failing.
if (!is.element("AdmitRate", names(college))) {
  print("Column 'AdmitRate' not found.")
} else {
  admit_rate <- as.numeric(college[["AdmitRate"]])
  print("Average AdmitRate:")
  print(mean(admit_rate, na.rm = TRUE))
}
## [1] "Average AdmitRate:"
## [1] 0.6702025
# Median of MidACT with missing values dropped; reports a missing column
# instead of failing.
if (!is.element("MidACT", names(college))) {
  print("Column 'MidACT' not found.")
} else {
  midact <- as.numeric(college[["MidACT"]])
  print("Median MidACT Score:")
  print(median(midact, na.rm = TRUE))
}
## [1] "Median MidACT Score:"
## [1] 23
# Sample standard deviation of AvgSAT with missing values dropped; reports a
# missing column instead of failing.
if (!is.element("AvgSAT", names(college))) {
  print("Column 'AvgSAT' not found.")
} else {
  avgsat <- as.numeric(college[["AvgSAT"]])
  print("Standard Deviation of AvgSAT:")
  print(sd(avgsat, na.rm = TRUE))
}
## [1] "Standard Deviation of AvgSAT:"
## [1] 128.9077
# Sample variance of Enrollment with missing values dropped; reports a
# missing column instead of failing.
if (!is.element("Enrollment", names(college))) {
  print("Column 'Enrollment' not found.")
} else {
  enrollment <- as.numeric(college[["Enrollment"]])
  print("Variance in Enrollment:")
  print(var(enrollment, na.rm = TRUE))
}
## [1] "Variance in Enrollment:"
## [1] 55846805
# Correlation between MidACT and AvgSAT, computed only over rows where both
# values are present (use = "complete.obs"). Both columns must exist.
if (!all(c("MidACT", "AvgSAT") %in% names(college))) {
  print("Column 'MidACT' and/or 'AvgSAT' not found.")
} else {
  midact <- as.numeric(college[["MidACT"]])
  avgsat <- as.numeric(college[["AvgSAT"]])
  print("Correlation between MidACT and AvgSAT:")
  print(cor(midact, avgsat, use = "complete.obs"))
}
## [1] "Correlation between MidACT and AvgSAT:"
## [1] 0.9820588
# Five-number summary (plus mean) of NetPrice and, when at least one usable
# value exists, a histogram of it.
if ("NetPrice" %in% names(college)) {
  # A numeric column is used directly. Passing it through gsub() would first
  # coerce it to text, where a digits-and-dot-only pattern drops minus signs
  # and the "e+"/"e-" of scientific notation (1e+05 would become "105").
  # Only non-numeric columns are cleaned, and the minus sign is kept.
  netprice <- if (is.numeric(college$NetPrice)) {
    as.numeric(college$NetPrice)
  } else {
    as.numeric(gsub("[^0-9.-]", "", college$NetPrice))
  }
  # Drop NA/NaN/Inf so summary() and hist() see only real values.
  netprice <- netprice[is.finite(netprice)]
  print("Summary of NetPrice:")
  print(summary(netprice))
  if (length(netprice) > 0) {
    hist(netprice,
      main = "Histogram of Net Price",
      xlab = "Net Price",
      col = "lightblue",
      border = "black",
      breaks = "Sturges"
    )
  } else {
    print("NetPrice data is not available or not numeric.")
  }
} else {
  print("Column 'NetPrice' not found.")
}
## [1] "Summary of NetPrice:"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 923 14494 19338 19887 24443 55775
# Boxplot of FacSalary; reports a missing column instead of failing.
if (!is.element("FacSalary", names(college))) {
  print("Column 'FacSalary' not found.")
} else {
  facsalary <- as.numeric(college[["FacSalary"]])
  boxplot(
    facsalary,
    ylab = "Faculty Salary",
    main = "Boxplot of Faculty Salary"
  )
}
# Bar plot of how many colleges fall under each Control value; reports a
# missing column instead of failing.
if (!is.element("Control", names(college))) {
  print("Column 'Control' not found.")
} else {
  control_table <- table(college[["Control"]])
  barplot(
    control_table,
    col = "lightgreen",
    main = "Bar Plot of College Control Types",
    xlab = "Control",
    ylab = "Frequency"
  )
}
# Correlation between Cost and Debt over rows where both values are present
# (use = "complete.obs"). Both columns must exist.
if ("Cost" %in% names(college) && "Debt" %in% names(college)) {
  # Numeric columns are used directly. Passing them through gsub() would
  # coerce them to text, where a digits-and-dot-only pattern drops minus
  # signs and the "e+"/"e-" of scientific notation (1e+05 -> "105").
  # Only non-numeric columns are cleaned, and the minus sign is kept.
  cost <- if (is.numeric(college$Cost)) {
    as.numeric(college$Cost)
  } else {
    as.numeric(gsub("[^0-9.-]", "", college$Cost))
  }
  debt <- if (is.numeric(college$Debt)) {
    as.numeric(college$Debt)
  } else {
    as.numeric(gsub("[^0-9.-]", "", college$Debt))
  }
  print("Correlation between Cost and Debt:")
  print(cor(cost, debt, use = "complete.obs"))
} else {
  print("Column 'Cost' and/or 'Debt' not found.")
}
## [1] "Correlation between Cost and Debt:"
## [1] -0.2144525
# Mean of MedIncome with missing values dropped; reports a missing column
# instead of failing.
if ("MedIncome" %in% names(college)) {
  # A numeric column is used directly. Passing it through gsub() would coerce
  # it to text, where a digits-and-dot-only pattern drops minus signs and the
  # "e+"/"e-" of scientific notation (1e-05 -> "105"). Only non-numeric
  # columns are cleaned, and the minus sign is kept.
  medincome <- if (is.numeric(college$MedIncome)) {
    as.numeric(college$MedIncome)
  } else {
    as.numeric(gsub("[^0-9.-]", "", college$MedIncome))
  }
  print("Average MedIncome:")
  print(mean(medincome, na.rm = TRUE))
} else {
  print("Column 'MedIncome' not found.")
}
## [1] "Average MedIncome:"
## [1] 46.51453
Analysis of the CollegeScores4yr data yields the following insights into four-year colleges:
The insights found within this data provide snapshots of the academics, finances, and institutional landscape of the institutions in the dataset.
# Load library
library(ggplot2)
# Read dataset
# Fetch the CollegeScores4yr CSV from the Lock5 site (first row is the header,
# text columns stay character) and list the column names that were read.
college <- read.csv(
  file = "https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
print(colnames(college))
# Question 1: mean of AdmitRate, missing values dropped.
# Skipped silently when the column is absent.
if (is.element("AdmitRate", names(college))) {
  admit_rate <- as.numeric(college[["AdmitRate"]])
  avg_admit_rate <- mean(admit_rate, na.rm = TRUE)
  print(avg_admit_rate)
}
# Question 2: median of MidACT, missing values dropped.
# Skipped silently when the column is absent.
if (is.element("MidACT", names(college))) {
  midact <- as.numeric(college[["MidACT"]])
  med_midact <- median(midact, na.rm = TRUE)
  print(med_midact)
}
# Question 3: correlation between MidACT and AvgSAT over rows where both
# values are present. Skipped silently unless both columns exist.
if (all(c("MidACT", "AvgSAT") %in% names(college))) {
  midact <- as.numeric(college[["MidACT"]])
  avgsat <- as.numeric(college[["AvgSAT"]])
  corr_midact_avgsat <- cor(midact, avgsat, use = "complete.obs")
  print(corr_midact_avgsat)
}
# Question 4: sample variance of Enrollment, missing values dropped.
# Skipped silently when the column is absent.
if (is.element("Enrollment", names(college))) {
  enrollment <- as.numeric(college[["Enrollment"]])
  var_enrollment <- var(enrollment, na.rm = TRUE)
  print(var_enrollment)
}
# Question 5: frequency table of Control values, printed and then drawn as a
# pie chart with one rainbow colour per category.
# Skipped silently when the column is absent.
if (is.element("Control", names(college))) {
  control_table <- table(college[["Control"]])
  print(control_table)
  pie(
    control_table,
    col = rainbow(length(control_table)),
    main = "Pie Chart of College Control"
  )
}
# Question 6: mean of NetPrice and a histogram of its finite values.
# Skipped silently when the column is absent.
if ("NetPrice" %in% names(college)) {
  # A numeric column is used directly. Passing it through gsub() would coerce
  # it to text, where a digits-and-dot-only pattern drops minus signs and the
  # "e+"/"e-" of scientific notation (1e+05 -> "105"). Only non-numeric
  # columns are cleaned, and the minus sign is kept.
  netprice <- if (is.numeric(college$NetPrice)) {
    as.numeric(college$NetPrice)
  } else {
    as.numeric(gsub("[^0-9.-]", "", college$NetPrice))
  }
  # Drop NA/NaN/Inf so mean() and hist() see only real values.
  netprice <- netprice[is.finite(netprice)]
  avg_netprice <- mean(netprice)
  print(avg_netprice)
  # hist() cannot draw an empty vector; report that instead of erroring.
  if (length(netprice) > 0) {
    hist(netprice,
      main = "Histogram of Net Price", xlab = "Net Price",
      col = "lightblue", border = "black", breaks = "Sturges"
    )
  } else {
    print("NetPrice data is not available or not numeric.")
  }
}
# Question 7: mean of TuitionIn over its finite values.
# Skipped silently when the column is absent.
if ("TuitionIn" %in% names(college)) {
  # A numeric column is used directly. Passing it through gsub() would coerce
  # it to text, where a digits-and-dot-only pattern drops minus signs and the
  # "e+"/"e-" of scientific notation (1e+05 -> "105"). Only non-numeric
  # columns are cleaned, and the minus sign is kept.
  tuition_in <- if (is.numeric(college$TuitionIn)) {
    as.numeric(college$TuitionIn)
  } else {
    as.numeric(gsub("[^0-9.-]", "", college$TuitionIn))
  }
  # Drop NA/NaN/Inf before averaging.
  tuition_in <- tuition_in[is.finite(tuition_in)]
  avg_tuition_in <- mean(tuition_in)
  print(avg_tuition_in)
}
# Question 8: mean of FacSalary, missing values dropped.
# Skipped silently when the column is absent.
if (is.element("FacSalary", names(college))) {
  facsalary <- as.numeric(college[["FacSalary"]])
  avg_facsalary <- mean(facsalary, na.rm = TRUE)
  print(avg_facsalary)
}
# Question 9: correlation between Cost and Debt over rows where both values
# are present. Skipped silently unless both columns exist.
if ("Cost" %in% names(college) && "Debt" %in% names(college)) {
  # Numeric columns are used directly. Passing them through gsub() would
  # coerce them to text, where a digits-and-dot-only pattern drops minus
  # signs and the "e+"/"e-" of scientific notation (1e+05 -> "105").
  # Only non-numeric columns are cleaned, and the minus sign is kept.
  cost <- if (is.numeric(college$Cost)) {
    as.numeric(college$Cost)
  } else {
    as.numeric(gsub("[^0-9.-]", "", college$Cost))
  }
  debt <- if (is.numeric(college$Debt)) {
    as.numeric(college$Debt)
  } else {
    as.numeric(gsub("[^0-9.-]", "", college$Debt))
  }
  corr_cost_debt <- cor(cost, debt, use = "complete.obs")
  print(corr_cost_debt)
}
# Question 10: mean of MedIncome over its finite values.
# Skipped silently when the column is absent.
if ("MedIncome" %in% names(college)) {
  # A numeric column is used directly. Passing it through gsub() would coerce
  # it to text, where a digits-and-dot-only pattern drops minus signs and the
  # "e+"/"e-" of scientific notation (1e-05 -> "105"). Only non-numeric
  # columns are cleaned, and the minus sign is kept.
  medincome <- if (is.numeric(college$MedIncome)) {
    as.numeric(college$MedIncome)
  } else {
    as.numeric(gsub("[^0-9.-]", "", college$MedIncome))
  }
  # Drop NA/NaN/Inf before averaging.
  medincome <- medincome[is.finite(medincome)]
  avg_medincome <- mean(medincome)
  print(avg_medincome)
}