Assignment 7

Packages

library(ggplot2)
# Load the course dataset from its CSV file into the `college` data frame
# that the questions below work from.
college_csv_path <- "~/Desktop/BANA 4137/college.csv"
college <- read.csv(file = college_csv_path)

QUESTION 1:

# Question 1: size of the dataset and whether any value is missing.
# dim() gives c(rows, columns): rows are observations, columns are variables.
college_dims <- dim(college)
num_variables <- college_dims[2]
num_observations <- college_dims[1]
num_variables
## [1] 17
num_observations
## [1] 1269
# TRUE when at least one cell anywhere in the data frame is NA.
missing_values <- sum(is.na(college)) > 0
missing_values
## [1] TRUE

QUESTION 2:

# Question 2: mean admission rate per state, drawn as a bar chart.
# The formula interface of aggregate() omits rows with a missing
# admission_rate or state (its default na.action) before averaging.
avg_admission_rate <- aggregate(admission_rate ~ state,
                                data = college, FUN = mean)
# One bar per state is too many for horizontal axis labels: base graphics
# silently skips labels that would overlap. Rotating them (las = 2) and
# shrinking them (cex.names) lets the state names fit under their bars.
barplot(avg_admission_rate$admission_rate,
        names.arg = avg_admission_rate$state,
        las = 2, cex.names = 0.6,
        xlab = "State", ylab = "Average Admission Rate",
        main = "Average Admission Rate by State",
        col = "pink")

QUESTION 3:

# Question 3: loan default rate against median debt, one point per school,
# with a GAM smoother (and its confidence band) drawn over the points.
ggplot(data = college,
       mapping = aes(x = median_debt, y = loan_default_rate)) +
  geom_point(colour = "brown") +
  geom_smooth(method = "gam") +
  labs(
    title = "Scatter Plot: Median Debt vs. Loan Default Rate",
    x = "Median Debt",
    y = "Loan Default Rate"
  )
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).

QUESTION 4:

# Question 4: bubble plot of loan default rate against median debt, with
# point area showing the number of undergraduates and a GAM trend weighted
# by the same quantity.
#
# `size` is mapped on geom_point() only. When it sat in the global aes() it
# was inherited by geom_smooth(), which cannot use a per-row size: ggplot2
# then warned that the `size` aesthetic for lines is deprecated and that the
# aesthetic was dropped during the statistical transformation.
ggplot(college, aes(x = median_debt, y = loan_default_rate)) +
  geom_point(aes(size = undergrads), color = "red") +
  # weight makes each school count in the fit in proportion to its enrolment.
  geom_smooth(method = "gam", aes(weight = undergrads)) +
  xlab("Median Debt") +
  ylab("Loan Default Rate") +
  ggtitle("Bubble Plot: Median Debt vs. Loan Default Rate (Weighted)")
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).

QUESTION 5:

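The weighted trend in Question 4 gives the more accurate representation: weighting by the number of undergraduates lets each school influence the fitted curve in proportion to its enrollment, so the trend describes the typical student rather than the typical school.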

QUESTION 6:

# Question 6 (log scale): overlapping, semi-transparent density curves of
# log10(undergrads), one curve per highest-degree category.
ggplot(data = college,
       mapping = aes(x = log10(undergrads), fill = highest_degree)) +
  geom_density(alpha = 0.5) +
  labs(
    title = "Overlapping Density Plot: Undergrad Distribution by Highest Degree",
    x = "Log10(Undergrads)",
    y = "Density"
  )

# Question 6 (raw scale): the same overlapping densities on the untransformed
# undergraduate count, with comma-formatted axis labels.
ggplot(data = college,
       mapping = aes(x = undergrads, fill = highest_degree)) +
  geom_density(alpha = 0.5) +
  scale_x_continuous(labels = scales::comma) +
  ggtitle("Density Plot of Undergrads by Highest Degree") +
  xlab("Number of Undergrads") +
  ylab("Density") +
  theme_minimal()

QUESTION 7:

# Question 7: density curves of log10(undergrads) per highest-degree
# category, stacked on top of each other instead of overlaid.
ggplot(data = college,
       mapping = aes(x = log10(undergrads), fill = highest_degree)) +
  geom_density(position = "stack", alpha = 0.5) +
  labs(
    title = "Stacked Density Plot: Composition of Undergrad Distribution by Highest Degree",
    x = "Log10(Undergrads)",
    y = "Density"
  )

QUESTION 8:

# Question 8: number of schools in each highest-degree category, printed as
# a frequency table and drawn as a bar chart.
school_counts <- table(college[["highest_degree"]])
school_counts
## 
## Associate  Bachelor  Graduate 
##        20       200      1049
# One rainbow colour per category in the table.
bar_colours <- rainbow(length(school_counts))
barplot(
  school_counts,
  col = bar_colours,
  main = "Number of Schools by Highest Degree Type",
  xlab = "Highest Degree Type",
  ylab = "Number of Schools"
)

QUESTION 9:

# Question 9: overlapping density curves of log10(undergrads) per
# highest-degree category.
#
# after_stat(count) replaces the `..count..` dot-dot notation, which was
# deprecated in ggplot2 3.4.0 and produced a lifecycle warning.
# NOTE(review): mapping `weight` to a computed statistic is applied after the
# density has been estimated, so it most likely does not change the drawn
# curves. If the intent is to scale each curve by its number of schools, map
# `y = after_stat(count)` instead and relabel the y axis -- confirm against
# the assignment wording.
ggplot(college, aes(x = log10(undergrads), fill = highest_degree)) +
  geom_density(aes(weight = after_stat(count)), alpha = 0.5) +
  labs(title = "Proportional Composition of Undergrads by Highest Degree",
       x = "log10(Number of Undergrads)",
       y = "Density") +
  theme_minimal()

QUESTION 10:

This scatter plot visualizes the relationship between admission rate and SAT average, with color representing the institution's control type and point shape representing its region.

# Question 10: admission rate against SAT average; point colour encodes
# `control` and point shape encodes `region`, each with a manual scale.
ggplot(
  data = college,
  mapping = aes(x = sat_avg, y = admission_rate,
                color = control, shape = region)
) +
  geom_point(size = 3, alpha = 0.7) +
  scale_color_manual(values = c("blue", "red")) +
  scale_shape_manual(values = c(16, 17, 18, 19)) +
  labs(
    title = "Relationship between Admission Rate and SAT Average by Control and Region",
    x = "SAT Average",
    y = "Admission Rate"
  )

QUESTION 11:

Question 1: What is the relationship between the number of undergraduates and the average SAT score, considering the control and region of the institution, while differentiating by the highest degree offered?

# Question 11, plot 1: SAT average against number of undergraduates; point
# colour encodes `control` and point shape encodes `highest_degree`.
#
# The y axis and title say "Average SAT Score": the plotted column is
# `sat_avg`, so the earlier "Median" wording mislabelled it.
# NOTE(review): the question text also mentions region, which is not mapped
# here; consider adding facet_wrap(~ region) if region should be shown.
ggplot(college, aes(x = undergrads, y = sat_avg,
                    color = control, shape = highest_degree)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(x = "Number of Undergraduates", y = "Average SAT Score",
       color = "Control", shape = "Highest Degree") +
  ggtitle("Relationship between Number of Undergraduates and Average SAT Score by Institution Type and Highest Degree")

Question 2: How does the loan default rate vary across different regions and types of institutions?

# Question 11, plot 2: box plots of loan default rate for each region, with
# side-by-side boxes filled by `control`.
ggplot(data = college,
       mapping = aes(x = region, y = loan_default_rate, fill = control)) +
  geom_boxplot() +
  xlab("Region") +
  ylab("Loan Default Rate") +
  guides(fill = guide_legend(title = "Control")) +
  ggtitle("Loan Default Rate across Regions by Institution Type")
## Warning: Removed 2 rows containing non-finite values (`stat_boxplot()`).