Introduction

This report analyzes the “CollegeScores4yr.csv” dataset using descriptive statistics methods from Chapter 6. The following 10 questions are addressed from the dataset:

  1. What is the average college admission rate (“AdmitRate”)?
  2. What is the median MidACT score (“MidACT”) across colleges?
  3. What is the standard deviation of the average SAT score (“AvgSAT”)?
  4. What is the variance in college enrollment (“Enrollment”)?
  5. What is the correlation between the MidACT score (“MidACT”) and the average SAT score (“AvgSAT”)?
  6. What does the histogram of net price (“NetPrice”) look like?
  7. How does the boxplot of faculty salary (“FacSalary”) appear across colleges?
  8. What does the bar plot of college control types (“Control”) look like?
  9. What is the correlation between cost (“Cost”) and debt (“Debt”)?
  10. What is the average median income (“MedIncome”) for the colleges?

Analysis

# Attach the ggplot2 package
library(ggplot2)

# Read the CollegeScores4yr CSV from the Lock5 site into a data frame.
# The first row supplies the column names; text columns stay character.
college <- read.csv(
  file = "https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)


# Print a label, then the name of every column in the data frame
print("Column names:")
## [1] "Column names:"
print(colnames(college))
##  [1] "Name"        "State"       "ID"          "Main"        "Accred"     
##  [6] "MainDegree"  "HighDegree"  "Control"     "Region"      "Locale"     
## [11] "Latitude"    "Longitude"   "AdmitRate"   "MidACT"      "AvgSAT"     
## [16] "Online"      "Enrollment"  "White"       "Black"       "Hispanic"   
## [21] "Asian"       "Other"       "PartTime"    "NetPrice"    "Cost"       
## [26] "TuitionIn"   "TuitonOut"   "TuitionFTE"  "InstructFTE" "FacSalary"  
## [31] "FullTimeFac" "Pell"        "CompRate"    "Debt"        "Female"     
## [36] "FirstGen"    "MedIncome"

Question 1: Average Admission Rate (“AdmitRate”)

# Q1: mean of AdmitRate across colleges, ignoring missing values.
# Handle the missing-column case first, then compute.
if (!("AdmitRate" %in% colnames(college))) {
  print("Column 'AdmitRate' not found.")
} else {
  admit_rate <- as.numeric(college[["AdmitRate"]])
  print("Average AdmitRate:")
  print(mean(admit_rate, na.rm = TRUE))
}
## [1] "Average AdmitRate:"
## [1] 0.6702025

Question 2: Median MidACT Score (“MidACT”)

# Q2: median of MidACT, with missing scores dropped before computing
if (is.element("MidACT", names(college))) {
  midact <- as.numeric(college[["MidACT"]])
  print("Median MidACT Score:")
  print(median(midact, na.rm = TRUE))
} else {
  print("Column 'MidACT' not found.")
}
## [1] "Median MidACT Score:"
## [1] 23

Question 3: Standard Deviation of Average SAT Score (“AvgSAT”)

# Q3: sample standard deviation of AvgSAT (missing values excluded)
if (!("AvgSAT" %in% names(college))) {
  print("Column 'AvgSAT' not found.")
} else {
  avgsat <- as.numeric(college[["AvgSAT"]])
  print("Standard Deviation of AvgSAT:")
  print(sd(avgsat, na.rm = TRUE))
}
## [1] "Standard Deviation of AvgSAT:"
## [1] 128.9077

Question 4: Variance in Enrollment (“Enrollment”)

# Q4: sample variance of Enrollment (missing values excluded)
if (is.element("Enrollment", colnames(college))) {
  enrollment <- as.numeric(college[["Enrollment"]])
  print("Variance in Enrollment:")
  print(var(enrollment, na.rm = TRUE))
} else {
  print("Column 'Enrollment' not found.")
}
## [1] "Variance in Enrollment:"
## [1] 55846805

Question 5: Correlation between MidACT and AvgSAT

# Q5: Pearson correlation between MidACT and AvgSAT.
# Both columns must be present; rows with a missing value in either
# column are left out via use = "complete.obs".
if (all(c("MidACT", "AvgSAT") %in% names(college))) {
  midact <- as.numeric(college[["MidACT"]])
  avgsat <- as.numeric(college[["AvgSAT"]])
  print("Correlation between MidACT and AvgSAT:")
  print(cor(x = midact, y = avgsat, use = "complete.obs"))
} else {
  print("Column 'MidACT' and/or 'AvgSAT' not found.")
}
## [1] "Correlation between MidACT and AvgSAT:"
## [1] 0.9820588

Question 6: Histogram of Net Price (“NetPrice”)

# Q6: distribution of NetPrice, shown as a numeric summary plus a histogram.
if ("NetPrice" %in% names(college)) {
  netprice <- college$NetPrice
  # Only text columns need cleaning. Deleting every non-digit character from
  # an already-numeric column would drop minus signs and exponent markers
  # (a double 100000 is rendered "1e+05" and would become "105"), silently
  # corrupting values. For text, remove just currency symbols, thousands
  # separators and whitespace so the sign survives.
  if (!is.numeric(netprice)) {
    netprice <- gsub("[$,[:space:]]", "", netprice)
  }
  # Entries that still cannot be parsed become NA (warning suppressed on
  # purpose: they are filtered out on the next line).
  netprice <- suppressWarnings(as.numeric(netprice))
  # Keep only real, finite prices for the summary and the plot
  netprice <- netprice[is.finite(netprice)]
  print("Summary of NetPrice:")
  print(summary(netprice))
  if (length(netprice) > 0) {
    hist(netprice,
         main = "Histogram of Net Price",
         xlab = "Net Price",
         col = "lightblue",
         border = "black",
         breaks = "Sturges")
  } else {
    print("NetPrice data is not available or not numeric.")
  }
} else {
  print("Column 'NetPrice' not found.")
}
## [1] "Summary of NetPrice:"
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     923   14494   19338   19887   24443   55775

Question 7: Boxplot of Faculty Salary (“FacSalary”)

# Q7: box-and-whisker plot of FacSalary.
# Report the missing column first; otherwise draw the plot.
if (!("FacSalary" %in% names(college))) {
  print("Column 'FacSalary' not found.")
} else {
  facsalary <- as.numeric(college[["FacSalary"]])
  boxplot(
    facsalary,
    ylab = "Faculty Salary",
    main = "Boxplot of Faculty Salary"
  )
}

Question 8: Bar Plot of College Control (“Control”)

# Q8: bar plot of how many colleges fall under each Control value
if (!("Control" %in% names(college))) {
  print("Column 'Control' not found.")
} else {
  # Frequency count per distinct Control value
  control_table <- table(college[["Control"]])
  barplot(
    control_table,
    col = "lightgreen",
    xlab = "Control",
    ylab = "Frequency",
    main = "Bar Plot of College Control Types"
  )
}

Question 9: Correlation between Cost (“Cost”) and Debt (“Debt”)

# Q9: Pearson correlation between Cost and Debt, using only rows where both
# values are present (use = "complete.obs").
if ("Cost" %in% names(college) && "Debt" %in% names(college)) {
  # Convert both columns with the same rule. Numeric columns are used as-is:
  # deleting every non-digit character would drop minus signs and exponent
  # markers (a double 100000 is rendered "1e+05" and would become "105").
  # Text columns only lose currency symbols, commas and whitespace.
  money <- lapply(college[c("Cost", "Debt")], function(column) {
    if (!is.numeric(column)) {
      column <- gsub("[$,[:space:]]", "", column)
    }
    # Unparseable entries become NA and are dropped by "complete.obs"
    suppressWarnings(as.numeric(column))
  })
  cost <- money$Cost
  debt <- money$Debt
  print("Correlation between Cost and Debt:")
  print(cor(cost, debt, use = "complete.obs"))
} else {
  print("Column 'Cost' and/or 'Debt' not found.")
}
## [1] "Correlation between Cost and Debt:"
## [1] -0.2144525

Question 10: Average Median Income (“MedIncome”)

# Q10: mean of MedIncome across colleges, ignoring missing values.
if ("MedIncome" %in% names(college)) {
  medincome <- college$MedIncome
  # Numeric columns are used as-is: deleting every non-digit character would
  # drop minus signs and exponent markers and silently change values.
  # Text columns only lose currency symbols, commas and whitespace.
  if (!is.numeric(medincome)) {
    medincome <- gsub("[$,[:space:]]", "", medincome)
  }
  # Unparseable entries become NA and are skipped by na.rm = TRUE
  medincome <- suppressWarnings(as.numeric(medincome))
  print("Average MedIncome:")
  print(mean(medincome, na.rm = TRUE))
} else {
  print("Column 'MedIncome' not found.")
}
## [1] "Average MedIncome:"
## [1] 46.51453

Conclusion

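After analyzing the CollegeScores4yr data, several insights into four-year colleges emerge: the average admission rate is about 67%, the median MidACT score is 23, MidACT and AvgSAT are very strongly correlated (r ≈ 0.98), and cost and debt are weakly negatively correlated (r ≈ −0.21).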

The insights found within this data provide a snapshot of the academic, financial, and institutional landscape of the colleges in the dataset.

Appendix

# Attach the ggplot2 package
library(ggplot2)

# Read the CollegeScores4yr CSV from the Lock5 site into a data frame.
# The first row supplies the column names; text columns stay character.
college <- read.csv(
  file = "https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Show the name of every column that was read
print(colnames(college))

# Question 1: Average College Admission Rate (AdmitRate)
# Nothing is printed when the column is absent.
if (is.element("AdmitRate", colnames(college))) {
  admit_rate <- as.numeric(college[["AdmitRate"]])
  # Missing rates are left out of the mean
  avg_admit_rate <- mean(admit_rate, na.rm = TRUE)
  print(avg_admit_rate)
}

# Question 2: Median MidACT Score (MidACT)
# Nothing is printed when the column is absent.
if (is.element("MidACT", colnames(college))) {
  midact <- as.numeric(college[["MidACT"]])
  # Missing scores are left out of the median
  med_midact <- median(midact, na.rm = TRUE)
  print(med_midact)
}

# Question 3: Standard Deviation of Average SAT Score (AvgSAT)
# The report's Question 3 asks for the spread of AvgSAT, so this block
# computes the sample standard deviation with missing values excluded.
# Nothing is printed when the column is absent.
if ("AvgSAT" %in% names(college)) {
  avgsat <- as.numeric(college$AvgSAT)
  sd_avgsat <- sd(avgsat, na.rm = TRUE)
  print(sd_avgsat)
}

# Question 4: Variance in Enrollment
# Nothing is printed when the column is absent.
if (is.element("Enrollment", colnames(college))) {
  enrollment <- as.numeric(college[["Enrollment"]])
  # Sample variance with missing enrollments left out
  var_enrollment <- var(enrollment, na.rm = TRUE)
  print(var_enrollment)
}

# Question 5: Correlation between MidACT and AvgSAT
# The report's Question 5 asks for this correlation, so the block computes
# it here. Rows with a missing value in either column are excluded via
# use = "complete.obs". Nothing is printed when a column is absent.
if ("MidACT" %in% names(college) && "AvgSAT" %in% names(college)) {
  midact <- as.numeric(college$MidACT)
  avgsat <- as.numeric(college$AvgSAT)
  corr_midact_avgsat <- cor(midact, avgsat, use = "complete.obs")
  print(corr_midact_avgsat)
}

# Question 6: Average Net Price and Histogram (NetPrice)
# Nothing is printed or drawn when the column is absent.
if ("NetPrice" %in% names(college)) {
  netprice <- college$NetPrice
  # Only text columns need cleaning. Deleting every non-digit character from
  # an already-numeric column would drop minus signs and exponent markers
  # (a double 100000 is rendered "1e+05" and would become "105"). For text,
  # remove just currency symbols, thousands separators and whitespace.
  if (!is.numeric(netprice)) {
    netprice <- gsub("[$,[:space:]]", "", netprice)
  }
  # Unparseable entries become NA and are filtered out on the next line
  netprice <- suppressWarnings(as.numeric(netprice))
  netprice <- netprice[is.finite(netprice)]
  avg_netprice <- mean(netprice, na.rm = TRUE)
  print(avg_netprice)
  # hist() stops with an error on an empty vector, so only plot when at
  # least one usable price remains
  if (length(netprice) > 0) {
    hist(netprice, main = "Histogram of Net Price", xlab = "Net Price",
         col = "lightblue", border = "black", breaks = "Sturges")
  }
}

# Question 7: Boxplot of Faculty Salary (FacSalary)
# The report's Question 7 asks for a boxplot of FacSalary, so this block
# draws that plot. Nothing is drawn when the column is absent.
if ("FacSalary" %in% names(college)) {
  facsalary <- as.numeric(college$FacSalary)
  boxplot(facsalary,
          main = "Boxplot of Faculty Salary",
          ylab = "Faculty Salary")
}

# Question 8: Bar Plot of College Control (Control)
# The report's Question 8 asks for a bar plot of Control types, so this block
# tabulates the column and plots the counts. Nothing is drawn when the column
# is absent.
if ("Control" %in% names(college)) {
  # Frequency count per distinct Control value
  control_table <- table(college$Control)
  barplot(control_table,
          main = "Bar Plot of College Control Types",
          xlab = "Control",
          ylab = "Frequency",
          col = "lightgreen")
}

# Question 9: Correlation between Cost and Debt
# Uses only rows where both values are present (use = "complete.obs").
# Nothing is printed when a column is absent.
if ("Cost" %in% names(college) && "Debt" %in% names(college)) {
  # Convert both columns with the same rule. Numeric columns are used as-is:
  # deleting every non-digit character would drop minus signs and exponent
  # markers (a double 100000 is rendered "1e+05" and would become "105").
  # Text columns only lose currency symbols, commas and whitespace.
  money <- lapply(college[c("Cost", "Debt")], function(column) {
    if (!is.numeric(column)) {
      column <- gsub("[$,[:space:]]", "", column)
    }
    # Unparseable entries become NA and are dropped by "complete.obs"
    suppressWarnings(as.numeric(column))
  })
  cost <- money$Cost
  debt <- money$Debt
  corr_cost_debt <- cor(cost, debt, use = "complete.obs")
  print(corr_cost_debt)
}

# Question 10: Average Median Income (MedIncome)
# Nothing is printed when the column is absent.
if ("MedIncome" %in% names(college)) {
  medincome <- college$MedIncome
  # Numeric columns are used as-is: deleting every non-digit character would
  # drop minus signs and exponent markers and silently change values.
  # Text columns only lose currency symbols, commas and whitespace.
  if (!is.numeric(medincome)) {
    medincome <- gsub("[$,[:space:]]", "", medincome)
  }
  # Unparseable entries become NA and are removed by the finite filter
  medincome <- suppressWarnings(as.numeric(medincome))
  medincome <- medincome[is.finite(medincome)]
  avg_medincome <- mean(medincome, na.rm = TRUE)
  print(avg_medincome)
}