Discussion week2

Author

Nuobing Fan

Data Sets

Note we only need to install packages once

library(datasets)
#?datasets()
#?AER
#install.packages("AER")
#install.packages("MASS")
#install.packages("https://cran.r-project.org/src/contrib/Archive/MASS/MASS_7.3-51.6.tar.gz", repos = NULL, type="source")
#install.packages("ggplot2")
#install.packages("dplyr")
library(ggplot2)

Warning: package 'ggplot2' was built under R version 4.3.3

library(AER)

Warning: package 'AER' was built under R version 4.3.3

Loading required package: car

Warning: package 'car' was built under R version 4.3.3

Loading required package: carData

Warning: package 'carData' was built under R version 4.3.3

Loading required package: lmtest

Warning: package 'lmtest' was built under R version 4.3.3

Loading required package: zoo


Attaching package: 'zoo'

The following objects are masked from 'package:base':

    as.Date, as.Date.numeric

Loading required package: sandwich

Warning: package 'sandwich' was built under R version 4.3.3

Loading required package: survival

library(MASS)
library(dplyr)

Warning: package 'dplyr' was built under R version 4.3.3


Attaching package: 'dplyr'

The following object is masked from 'package:MASS':

    select

The following object is masked from 'package:car':

    recode

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

??AER
??MASS
?CASchools
?Boston

Set1(d1) and Set2(d2)

#Set1
data("CASchools", package = "AER")
#Set2
data("Boston", package = "MASS")

test

#head(CASchools)
#head(Boston)

# Check the structure to verify variable names
#str(CASchools)
#str(Boston)

Describe Data

Set1: CASchools (from the AER package) contains cross-sectional data on 420 California schools, with variables like student-teacher ratio, average test scores, and per-student expenditure. These variables allow people to explore how resource allocation impacts student outcomes.

Set2: Boston (from the MASS package) contains data on housing values in suburbs of Boston, with key variables like the crime rate, average number of rooms, and the proportion of the population considered lower status. This data helps study the factors affecting housing prices.

Type of Data

#note we can use visdat to vidualize data
#install.packages("visdat")
#library(visdat)
#vis_dat(CASchools)
#vis_dat(Boston)
# Function to calculate regression line and add equation to plot
add_regression_eq <- function(data, xvar, yvar) {
  # Remove rows with NA in either x or y variable
  data_clean <- data %>% dplyr::filter(!is.na(.data[[xvar]]) & !is.na(.data[[yvar]]))
  
  # Check if there is enough data to fit a model
  if (nrow(data_clean) < 2) {
    return("N/A")
  }
  
  # Fit the linear model
  model <- try(lm(as.formula(paste(yvar, "~", xvar)), data = data_clean), silent = TRUE)
  
  # If the model fails, return "N/A"
  if (inherits(model, "try-error")) {
    return("N/A")
  }
  
  # Extract coefficients and format the equation
  eq <- paste("y =", format(coef(model)[1], digits = 3), 
              "+", format(coef(model)[2], digits = 3), "x")
  return(eq)
}

CASchools: This dataset contains observations on different schools (420 California schools) at a single point in time, making it cross-sectional. Each row represents data from a different school, and there is no tracking of these schools over time.

# Set1: Scatterplot for CASchools (expenditure per student vs. test score)
# List of variables to plot against student_income
# Exclude "county", "school", and "district" from the variables for individual scatter plots
vars_CASchool <- setdiff(names(CASchools), c("income", "county", "school", "district"))


# Create scatter plots with regression lines for each variable
plots_CASchool <- lapply(vars_CASchool, function(var) {
  ggplot(CASchools, aes_string(x = "income", y = var)) +
    geom_point() +
    geom_smooth(method = "lm", se = TRUE) +
    labs(title = paste("Student Income vs", var),
         x = "income",
         y = var) +
    theme_minimal()
})

Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.

# Print all plots
#for (p in plots_CASchool) print(p)

# Group data by county and calculate the average student income and the number of schools per county
county_summary <- CASchools %>%
  group_by(county) %>%
  summarise(avg_income = mean(income, na.rm = TRUE),
            num_schools = n())

# Scatter plot: average student income per county
plot_county_income <- ggplot(county_summary, aes(x = avg_income, y = reorder(county, avg_income))) +
  geom_point() +
  labs(title = "Average Student Income by County",
       x = "Average Student Income",
       y = "County") +
  theme_minimal()

# Scatter plot: number of schools per county
plot_county_schools <- ggplot(county_summary, aes(x = num_schools, y = reorder(county, num_schools))) +
  geom_point() +
  labs(title = "Number of Schools by County",
       x = "Number of Schools",
       y = "County") +
  theme_minimal()

# Print county plots
print(plot_county_income)

print(plot_county_schools)

# Group data by district and calculate the average student income and the number of schools per district
district_summary <- CASchools %>%
  group_by(district) %>%
  summarise(avg_income = mean(income, na.rm = TRUE),
            num_schools = n())

# Scatter plot: average student income per district
plot_district_income <- ggplot(district_summary, aes(x = avg_income, y = reorder(district, avg_income))) +
  geom_point() +
  labs(title = "Average Student Income by District",
       x = "Average Student Income",
       y = "District") +
  theme_minimal()

# Scatter plot: number of schools per district
plot_district_schools <- ggplot(district_summary, aes(x = num_schools, y = reorder(district, num_schools))) +
  geom_point() +
  labs(title = "Number of Schools by District",
       x = "Number of Schools",
       y = "District") +
  theme_minimal()

# Print district plots
print(plot_district_income)

print(plot_district_schools)

plots_CASchool_eq <- lapply(vars_CASchool, function(var) {
  p <- ggplot(CASchools, aes_string(x = "income", y = var)) +
    geom_point() +
    geom_smooth(method = "lm", se = TRUE) +
    labs(title = paste("Student Income vs", var),
         x = "income",
         y = var) +
    theme_minimal()
  
  # Calculate regression equation
  eq <- add_regression_eq(CASchools, "income", var)
  
  # Add equation to the plot
  p + annotate("text", x = Inf, y = Inf, label = eq, hjust = 1.1, vjust = 1.1, size = 5, color = "red")
})

Warning in model.response(mf, "numeric"): using type = "numeric" with a factor
response will be ignored

Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors

for (p in plots_CASchool_eq) print(p)

`geom_smooth()` using formula = 'y ~ x'

`geom_smooth()` using formula = 'y ~ x'

`geom_smooth()` using formula = 'y ~ x'

`geom_smooth()` using formula = 'y ~ x'

`geom_smooth()` using formula = 'y ~ x'

`geom_smooth()` using formula = 'y ~ x'

`geom_smooth()` using formula = 'y ~ x'

`geom_smooth()` using formula = 'y ~ x'

`geom_smooth()` using formula = 'y ~ x'

`geom_smooth()` using formula = 'y ~ x'

Based on the previous calculations (due to the 8-page limit, I won’t display all models here), we developed a model where the dependent variable y is the sum of various factors for each student, excluding “county”, “students”, “grade”, and “district.” Most variables are assigned a coefficient of 1, except for specific variables such as expenditure (coefficient 27.6), reading scores (1.94), math scores (1.82), lunch percentage (-2.57), number of teachers (1.12), CalWORKs participation (-0.813), and percentage of English learners (-0.778). The x-axis represents student income, allowing us to examine how closely the estimated y aligns with actual student income.

library(ggplot2)
library(dplyr)

# Assign coefficients for selected variables
coef_list <- c(expenditure = 27.6, read = 1.94, math = 1.82, lunch = -2.57,
               teachers = 1.12, calworks = -0.813, English = -0.778)

# Calculate the weighted sum for each student
CASchool <- CASchools %>%
  rowwise() %>%
  mutate(weighted_sum = expenditure * coef_list["expenditure"] +
                       read * coef_list["read"] +
                       math * coef_list["math"] +
                       lunch * coef_list["lunch"] +
                       teachers * coef_list["teachers"] +
                       calworks * coef_list["calworks"] +
                       english * coef_list["English"]) %>%
  ungroup()

# Create the scatter plot with regression line
plot <- ggplot(CASchool, aes(x = income, y = weighted_sum)) +
  geom_point() +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  labs(title = "Est income vs Average Student Income",
       x = "Average Student Income",
       y = "Est income") +
  theme_minimal()

# Define the regression equation function
add_regression_eq <- function(data, xvar, yvar) {
  model <- try(lm(as.formula(paste(yvar, "~", xvar)), data = data, na.action = na.exclude), silent = TRUE)
  
  if (inherits(model, "try-error")) {
    return("N/A")
  }
  
  eq <- paste("y =", format(coef(model)[1], digits = 3), 
              "+", format(coef(model)[2], digits = 3), "x")
  return(eq)
}

# Add the regression equation to the plot
reg_eq <- add_regression_eq(CASchool, "income", "weighted_sum")

# Add annotation to the plot
plot + annotate("text", x = Inf, y = Inf, label = reg_eq, hjust = 1.1, vjust = 1.1, size = 5, color = "red")

`geom_smooth()` using formula = 'y ~ x'

Boston: Similarly, the Boston dataset provides information about housing in different suburbs of Boston at a particular point in time. Since it includes data from various suburbs without tracking them across multiple periods, it is also considered cross-sectional data.

# Set2: Scatterplot for Boston (number of rooms vs. median home value)
# List of variables to plot against medv
vars_Boston <- setdiff(names(Boston), "medv")

# Create scatter plots with regression lines for each variable
plots_Boston <- lapply(vars_Boston, function(var) {
  ggplot(Boston, aes_string(x = "medv", y = var)) +
    geom_point() +
    geom_smooth(method = "lm", se = TRUE) +
    labs(title = paste("Median Home Value vs", var),
         x = "Median Home Value",
         y = var) +
    theme_minimal()
})

# Print all plots
#for (p in plots_Boston) print(p)

plots_Boston_eq <- lapply(vars_Boston, function(var) {
  p <- ggplot(Boston, aes_string(x = "medv", y = var)) +
    geom_point() +
    geom_smooth(method = "lm", se = TRUE) +
    labs(title = paste("Median Home Value vs", var),
         x = "Median Home Value",
         y = var) +
    theme_minimal()
  eq <- add_regression_eq(Boston, "medv", var)
  p + annotate("text", x = Inf, y = Inf, label = eq, hjust = 1.1, vjust = 1.1, size = 5, color = "red")
})
for (p in plots_Boston_eq) print(p)

`geom_smooth()` using formula = 'y ~ x'

`geom_smooth()` using formula = 'y ~ x'

`geom_smooth()` using formula = 'y ~ x'

`geom_smooth()` using formula = 'y ~ x'

`geom_smooth()` using formula = 'y ~ x'

`geom_smooth()` using formula = 'y ~ x'

`geom_smooth()` using formula = 'y ~ x'

`geom_smooth()` using formula = 'y ~ x'

`geom_smooth()` using formula = 'y ~ x'

`geom_smooth()` using formula = 'y ~ x'

`geom_smooth()` using formula = 'y ~ x'

`geom_smooth()` using formula = 'y ~ x'

`geom_smooth()` using formula = 'y ~ x'

For brevity, I won’t display all regression models here, but the estimated home value (y) is calculated as the sum of the weighted averages of several variables with the following coefficients: crime rate (-0.363), residential land zoned (0.914), industrial areas (-0.361), Charles River dummy variable (0), nitrogen oxide concentration (-0.00538), average number of rooms (0.0531), age of homes (-1.15), distance to employment centers (0.0572), access to radial highways (-0.361), property tax rate (-8.59), pupil-teacher ratio (-0.12), proportion of African American residents (3.31), and percentage of lower status population (-0.573). The x-axis will represent the actual median home value, allowing us to compare the estimated home values to the real ones.

# Assign coefficients for selected variables
coef_list <- c(crim = -0.363, zn = 0.914, indus = -0.361, chas = 0, nox = -0.00538,
               rm = 0.0531, age = -1.15, dis = 0.0572, rad = -0.361, tax = -8.59, 
               ptratio = -0.12, black = 3.31, lstat = -0.573)

# Exclude 'medv' from the variables
vars_to_exclude <- "medv"
vars_Boston <- setdiff(names(Boston), vars_to_exclude)

# Calculate the weighted sum for each row
Boston <- Boston %>%
  rowwise() %>%
  mutate(weighted_sum = crim * coef_list["crim"] +
                       zn * coef_list["zn"] +
                       indus * coef_list["indus"] +
                       chas * coef_list["chas"] +
                       nox * coef_list["nox"] +
                       rm * coef_list["rm"] +
                       age * coef_list["age"] +
                       dis * coef_list["dis"] +
                       rad * coef_list["rad"] +
                       tax * coef_list["tax"] +
                       ptratio * coef_list["ptratio"] +
                       black * coef_list["black"] +
                       lstat * coef_list["lstat"])

# Create the scatter plot
#ggplot(Boston, aes(x = medv, y = weighted_sum)) +
#  geom_point(color = "blue") +
#  labs(title = "Weighted Sum of Variables vs Median Home Value",
#       x = "Median Home Value",
#       y = "Weighted Sum of Variables") +
#  theme_minimal() +
#  geom_smooth(method = "lm", color = "red", se = TRUE)  # Add a regression line

# Define the regression equation function
add_regression_eq <- function(data, xvar, yvar) {
  data_clean <- data %>% filter(!is.na(.data[[xvar]]) & !is.na(.data[[yvar]]))
  
  if (nrow(data_clean) < 2) {
    return("N/A")
  }
  
  model <- try(lm(as.formula(paste(yvar, "~", xvar)), data = data_clean), silent = TRUE)
  
  if (inherits(model, "try-error")) {
    return("N/A")
  }
  
  eq <- paste("y =", format(coef(model)[1], digits = 3), 
              "+", format(coef(model)[2], digits = 3), "x")
  return(eq)
}

# Add the equation to the plot
reg_eq <- add_regression_eq(Boston, "medv", "weighted_sum")

# Add annotation to the plot
ggplot(Boston, aes(x = medv, y = weighted_sum)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  labs(title = "Weighted Sum of Variables vs Median Home Value",
       x = "Median Home Value",
       y = "Weighted Sum of Variables") +
  annotate("text", x = Inf, y = Inf, label = reg_eq, hjust = 1.1, vjust = 1.1, size = 5, color = "red") +
  theme_minimal()

`geom_smooth()` using formula = 'y ~ x'

Summary for both sets In the CASchools dataset, different regions show varying income levels, and we can observe how various factors relate to student income. Factors with steeper slopes indicate a stronger influence on student income, highlighting their significance. Similarly, we can apply the same assumptions to the Boston dataset, analyzing how different factors relate to housing values. Variables with steeper slopes suggest a stronger impact on housing prices.