# Check the structure to verify variable names#str(CASchools)#str(Boston)
Describe Data
Set1: CASchools (from the AER package) contains cross-sectional data on 420 California schools, with variables like student-teacher ratio, average test scores, and per-student expenditure. These variables allow people to explore how resource allocation impacts student outcomes.
Set2: Boston (from the MASS package) contains data on housing values in suburbs of Boston, with key variables like the crime rate, average number of rooms, and the proportion of the population considered lower status. This data helps study the factors affecting housing prices.
Type of Data
#note we can use visdat to vidualize data#install.packages("visdat")#library(visdat)#vis_dat(CASchools)#vis_dat(Boston)# Function to calculate regression line and add equation to plotadd_regression_eq <-function(data, xvar, yvar) {# Remove rows with NA in either x or y variable data_clean <- data %>% dplyr::filter(!is.na(.data[[xvar]]) &!is.na(.data[[yvar]]))# Check if there is enough data to fit a modelif (nrow(data_clean) <2) {return("N/A") }# Fit the linear model model <-try(lm(as.formula(paste(yvar, "~", xvar)), data = data_clean), silent =TRUE)# If the model fails, return "N/A"if (inherits(model, "try-error")) {return("N/A") }# Extract coefficients and format the equation eq <-paste("y =", format(coef(model)[1], digits =3), "+", format(coef(model)[2], digits =3), "x")return(eq)}
CASchools: This dataset contains observations on different schools (420 California schools) at a single point in time, making it cross-sectional. Each row represents data from a different school, and there is no tracking of these schools over time.
# Set1: Scatterplot for CASchools (expenditure per student vs. test score)# List of variables to plot against student_income# Exclude "county", "school", and "district" from the variables for individual scatter plotsvars_CASchool <-setdiff(names(CASchools), c("income", "county", "school", "district"))# Create scatter plots with regression lines for each variableplots_CASchool <-lapply(vars_CASchool, function(var) {ggplot(CASchools, aes_string(x ="income", y = var)) +geom_point() +geom_smooth(method ="lm", se =TRUE) +labs(title =paste("Student Income vs", var),x ="income",y = var) +theme_minimal()})
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
# Print all plots#for (p in plots_CASchool) print(p)# Group data by county and calculate the average student income and the number of schools per countycounty_summary <- CASchools %>%group_by(county) %>%summarise(avg_income =mean(income, na.rm =TRUE),num_schools =n())# Scatter plot: average student income per countyplot_county_income <-ggplot(county_summary, aes(x = avg_income, y =reorder(county, avg_income))) +geom_point() +labs(title ="Average Student Income by County",x ="Average Student Income",y ="County") +theme_minimal()# Scatter plot: number of schools per countyplot_county_schools <-ggplot(county_summary, aes(x = num_schools, y =reorder(county, num_schools))) +geom_point() +labs(title ="Number of Schools by County",x ="Number of Schools",y ="County") +theme_minimal()# Print county plotsprint(plot_county_income)
print(plot_county_schools)
# Group data by district and calculate the average student income and the number of schools per districtdistrict_summary <- CASchools %>%group_by(district) %>%summarise(avg_income =mean(income, na.rm =TRUE),num_schools =n())# Scatter plot: average student income per districtplot_district_income <-ggplot(district_summary, aes(x = avg_income, y =reorder(district, avg_income))) +geom_point() +labs(title ="Average Student Income by District",x ="Average Student Income",y ="District") +theme_minimal()# Scatter plot: number of schools per districtplot_district_schools <-ggplot(district_summary, aes(x = num_schools, y =reorder(district, num_schools))) +geom_point() +labs(title ="Number of Schools by District",x ="Number of Schools",y ="District") +theme_minimal()# Print district plotsprint(plot_district_income)
print(plot_district_schools)
plots_CASchool_eq <-lapply(vars_CASchool, function(var) { p <-ggplot(CASchools, aes_string(x ="income", y = var)) +geom_point() +geom_smooth(method ="lm", se =TRUE) +labs(title =paste("Student Income vs", var),x ="income",y = var) +theme_minimal()# Calculate regression equation eq <-add_regression_eq(CASchools, "income", var)# Add equation to the plot p +annotate("text", x =Inf, y =Inf, label = eq, hjust =1.1, vjust =1.1, size =5, color ="red")})
Warning in model.response(mf, "numeric"): using type = "numeric" with a factor
response will be ignored
Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors
for (p in plots_CASchool_eq) print(p)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
Based on the previous calculations (due to the 8-page limit, I won’t display all models here), we developed a model where the dependent variable y is the sum of various factors for each student, excluding “county”, “students”, “grade”, and “district.” Most variables are assigned a coefficient of 1, except for specific variables such as expenditure (coefficient 27.6), reading scores (1.94), math scores (1.82), lunch percentage (-2.57), number of teachers (1.12), CalWORKs participation (-0.813), and percentage of English learners (-0.778). The x-axis represents student income, allowing us to examine how closely the estimated y aligns with actual student income.
library(ggplot2)library(dplyr)# Assign coefficients for selected variablescoef_list <-c(expenditure =27.6, read =1.94, math =1.82, lunch =-2.57,teachers =1.12, calworks =-0.813, English =-0.778)# Calculate the weighted sum for each studentCASchool <- CASchools %>%rowwise() %>%mutate(weighted_sum = expenditure * coef_list["expenditure"] + read * coef_list["read"] + math * coef_list["math"] + lunch * coef_list["lunch"] + teachers * coef_list["teachers"] + calworks * coef_list["calworks"] + english * coef_list["English"]) %>%ungroup()# Create the scatter plot with regression lineplot <-ggplot(CASchool, aes(x = income, y = weighted_sum)) +geom_point() +geom_smooth(method ="lm", color ="red", se =TRUE) +labs(title ="Est income vs Average Student Income",x ="Average Student Income",y ="Est income") +theme_minimal()# Define the regression equation functionadd_regression_eq <-function(data, xvar, yvar) { model <-try(lm(as.formula(paste(yvar, "~", xvar)), data = data, na.action = na.exclude), silent =TRUE)if (inherits(model, "try-error")) {return("N/A") } eq <-paste("y =", format(coef(model)[1], digits =3), "+", format(coef(model)[2], digits =3), "x")return(eq)}# Add the regression equation to the plotreg_eq <-add_regression_eq(CASchool, "income", "weighted_sum")# Add annotation to the plotplot +annotate("text", x =Inf, y =Inf, label = reg_eq, hjust =1.1, vjust =1.1, size =5, color ="red")
`geom_smooth()` using formula = 'y ~ x'
Boston: Similarly, the Boston dataset provides information about housing in different suburbs of Boston at a particular point in time. Since it includes data from various suburbs without tracking them across multiple periods, it is also considered cross-sectional data.
# Set2: Scatterplot for Boston (number of rooms vs. median home value)# List of variables to plot against medvvars_Boston <-setdiff(names(Boston), "medv")# Create scatter plots with regression lines for each variableplots_Boston <-lapply(vars_Boston, function(var) {ggplot(Boston, aes_string(x ="medv", y = var)) +geom_point() +geom_smooth(method ="lm", se =TRUE) +labs(title =paste("Median Home Value vs", var),x ="Median Home Value",y = var) +theme_minimal()})# Print all plots#for (p in plots_Boston) print(p)plots_Boston_eq <-lapply(vars_Boston, function(var) { p <-ggplot(Boston, aes_string(x ="medv", y = var)) +geom_point() +geom_smooth(method ="lm", se =TRUE) +labs(title =paste("Median Home Value vs", var),x ="Median Home Value",y = var) +theme_minimal() eq <-add_regression_eq(Boston, "medv", var) p +annotate("text", x =Inf, y =Inf, label = eq, hjust =1.1, vjust =1.1, size =5, color ="red")})for (p in plots_Boston_eq) print(p)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
For brevity, I won’t display all regression models here, but the estimated home value (y) is calculated as the sum of the weighted averages of several variables with the following coefficients: crime rate (-0.363), residential land zoned (0.914), industrial areas (-0.361), Charles River dummy variable (0), nitrogen oxide concentration (-0.00538), average number of rooms (0.0531), age of homes (-1.15), distance to employment centers (0.0572), access to radial highways (-0.361), property tax rate (-8.59), pupil-teacher ratio (-0.12), proportion of African American residents (3.31), and percentage of lower status population (-0.573). The x-axis will represent the actual median home value, allowing us to compare the estimated home values to the real ones.
# Assign coefficients for selected variablescoef_list <-c(crim =-0.363, zn =0.914, indus =-0.361, chas =0, nox =-0.00538,rm =0.0531, age =-1.15, dis =0.0572, rad =-0.361, tax =-8.59, ptratio =-0.12, black =3.31, lstat =-0.573)# Exclude 'medv' from the variablesvars_to_exclude <-"medv"vars_Boston <-setdiff(names(Boston), vars_to_exclude)# Calculate the weighted sum for each rowBoston <- Boston %>%rowwise() %>%mutate(weighted_sum = crim * coef_list["crim"] + zn * coef_list["zn"] + indus * coef_list["indus"] + chas * coef_list["chas"] + nox * coef_list["nox"] + rm * coef_list["rm"] + age * coef_list["age"] + dis * coef_list["dis"] + rad * coef_list["rad"] + tax * coef_list["tax"] + ptratio * coef_list["ptratio"] + black * coef_list["black"] + lstat * coef_list["lstat"])# Create the scatter plot#ggplot(Boston, aes(x = medv, y = weighted_sum)) +# geom_point(color = "blue") +# labs(title = "Weighted Sum of Variables vs Median Home Value",# x = "Median Home Value",# y = "Weighted Sum of Variables") +# theme_minimal() +# geom_smooth(method = "lm", color = "red", se = TRUE) # Add a regression line# Define the regression equation functionadd_regression_eq <-function(data, xvar, yvar) { data_clean <- data %>%filter(!is.na(.data[[xvar]]) &!is.na(.data[[yvar]]))if (nrow(data_clean) <2) {return("N/A") } model <-try(lm(as.formula(paste(yvar, "~", xvar)), data = data_clean), silent =TRUE)if (inherits(model, "try-error")) {return("N/A") } eq <-paste("y =", format(coef(model)[1], digits =3), "+", format(coef(model)[2], digits =3), "x")return(eq)}# Add the equation to the plotreg_eq <-add_regression_eq(Boston, "medv", "weighted_sum")# Add annotation to the plotggplot(Boston, aes(x = medv, y = weighted_sum)) +geom_point(color ="blue") +geom_smooth(method ="lm", color ="red", se =TRUE) +labs(title ="Weighted Sum of Variables vs Median Home Value",x ="Median Home Value",y ="Weighted Sum of Variables") +annotate("text", x =Inf, y =Inf, label = reg_eq, hjust =1.1, vjust =1.1, size =5, color ="red") +theme_minimal()
`geom_smooth()` using formula = 'y ~ x'
Summary for both sets In the CASchools dataset, different regions show varying income levels, and we can observe how various factors relate to student income. Factors with steeper slopes indicate a stronger influence on student income, highlighting their significance. Similarly, we can apply the same assumptions to the Boston dataset, analyzing how different factors relate to housing values. Variables with steeper slopes suggest a stronger impact on housing prices.