# Load necessary libraries
library(knitr)
library(kableExtra)

# Define the table data
lm_parameters <- data.frame(
  Parameter = c(
    "formula", "data", "subset", "weights", "na.action", "method",
    "model", "x", "y", "qr", "singular.ok", "contrasts", "offset", "..."
  ),
  Meaning = c(
    "Defines the relationship between response and predictors in Wilkinson-Rogers notation.",
    "Specifies the data frame containing the response and predictor variables.",
    "Selects a subset of observations to use, defined by logical statements.",
    "Assigns analytical weights to observations.",
    "Handles missing values (NAs) according to a specified action.",
    "Determines the method for fitting the model: 'qr' or 'model.frame'.",
    "Stores the model frame in the fitted object.",
    "Stores the model matrix in the fitted object.",
    "Stores the response variable in the fitted object.",
    "Stores the QR decomposition in the fitted object.",
    "Allows fitting even when predictors are collinear, setting some coefficients to NA.",
    "Defines contrasts for factor variables in the model.",
    "Includes a priori known components in the model.",
    "Passes additional arguments to lower-level fitting functions."
  )
)

# Create a colorful table
lm_parameters %>%
  kbl() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  row_spec(1:14, background = c("lightcyan", "lavender", "lightpink", "lightyellow", "lightgreen", "lightblue", 
                                "lightcoral", "lightgray", "palegoldenrod", "lightsteelblue", "khaki", 
                                "lightsalmon", "lightseagreen", "thistle")) %>%
  column_spec(1, bold = TRUE)
| Parameter | Meaning |
|-----------|---------|
| formula | Defines the relationship between response and predictors in Wilkinson-Rogers notation. |
| data | Specifies the data frame containing the response and predictor variables. |
| subset | Selects a subset of observations to use, defined by logical statements. |
| weights | Assigns analytical weights to observations. |
| na.action | Handles missing values (NAs) according to a specified action. |
| method | Determines the method for fitting the model: 'qr' or 'model.frame'. |
| model | Stores the model frame in the fitted object. |
| x | Stores the model matrix in the fitted object. |
| y | Stores the response variable in the fitted object. |
| qr | Stores the QR decomposition in the fitted object. |
| singular.ok | Allows fitting even when predictors are collinear, setting some coefficients to NA. |
| contrasts | Defines contrasts for factor variables in the model. |
| offset | Includes a priori known components in the model. |
| ... | Passes additional arguments to lower-level fitting functions. |
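# To see a few of these arguments used together, here is a minimal sketch I added for illustration.
# It uses the built-in mtcars data (not part of this analysis); the variable choices are arbitrary.
fit_args <- lm(
  mpg ~ wt + hp,        # formula in Wilkinson-Rogers notation
  data      = mtcars,   # data frame holding response and predictors
  subset    = cyl != 8, # keep only the non-8-cylinder cars
  na.action = na.omit,  # drop rows with missing values
  x = TRUE, y = TRUE    # store the model matrix and response in the fitted object
)
head(fit_args$x)        # the stored model matrix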
# Create the example dataset of house sizes and prices
house_data <- data.frame(
  size = c(1200, 1500, 1800, 2000, 2200, 2500, 2700, 3000, 3200, 3500),
  price = c(250000, 275000, 300000, 320000, 340000, 370000, 390000, 420000, 440000, 470000)
)

# Plot the relationship between size and price
plot(price ~ size, data = house_data, col = 2, main = "House Price vs Size", 
     xlab = "Size (sq ft)", ylab = "Price ($)")

# Fit the linear model
fit <- lm(price ~ size, data = house_data)

# Add the regression line to the plot
abline(fit, col = 3, lwd = 2)

# Prepare the regression equation text
bs <- round(coef(fit), 2)
lmlab <- paste0("price = ", bs[1], ifelse(sign(bs[2]) == 1, " + ", " - "), abs(bs[2]), " size ")

# Add the regression equation to the plot
mtext(lmlab, 3, line = -2)

# I built a linear model to predict house prices based on size (in square feet).
house_mdl <- lm(price ~ size, data = house_data)

# Here's a quick look at the model I built.
house_mdl
## 
## Call:
## lm(formula = price ~ size, data = house_data)
## 
## Coefficients:
## (Intercept)         size  
##   128851.88        96.88
# I have a new dataset with house sizes and want to predict the prices for these sizes.
set.seed(1234) # I set the seed for reproducibility.
new_sizes <- sample(house_data$size, 5) # I randomly selected 5 house sizes from my data.
new_sizes
## [1] 3500 2500 2200 2000 1200
# To make predictions, I need to create a new data frame with the same column name 'size' as in the original data.
new_house_df <- data.frame(size = new_sizes)
# Now, I use my model to predict house prices for these new sizes.
predicted_prices <- predict(house_mdl, new_house_df)
predicted_prices
##        1        2        3        4        5 
## 467948.7 371063.9 341998.4 322621.5 245113.6
# It's essential to match the column name in the new data frame with the original data. 
# Let's say I made a mistake and didn't use a data frame:
#predict(house_mdl, new_sizes) # This would give me an error!
#Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels) : 
#'data' must be a data.frame, environment, or list
# Similarly, if I didn't use the right column name in my new data frame:
#wrong_house_df <- data.frame(new_sizes)
#predict(house_mdl, wrong_house_df) # This would also result in an error!
#Error in eval(predvars, data, env) : object 'size' not found
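# A quick way to see why that last call fails (an illustrative check I added, not part of the original):
# without an explicit name, the column is called "new_sizes" rather than "size".
names(data.frame(new_sizes))        # "new_sizes" - predict() cannot find 'size'
names(data.frame(size = new_sizes)) # "size"      - matches the model's predictor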
# To check the accuracy of my predictions, I'll need the actual house prices.
# Here's a new data frame with both size and actual prices for comparison.
comparison_df <- data.frame(price = house_data$price[1:10], size = house_data$size[1:10])
# Using my model, I predict the prices for these sizes.
predicted_comparison <- predict(house_mdl, comparison_df)
# Finally, I calculate the root mean square error (RMSE) to evaluate my model's prediction accuracy.
rmse <- sqrt(mean((predicted_comparison - comparison_df$price)^2, na.rm = TRUE))
rmse
## [1] 2301.483
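# Because these predictions are for the same observations the model was fitted on,
# the same RMSE can be obtained directly from the model's residuals (a quick sanity check I added):
sqrt(mean(residuals(house_mdl)^2))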
# I created a dataset to study house prices influenced by location, house type, and age of the house.
house_data <- data.frame(
  price = c(300000, 450000, 350000, 400000, 320000),
  urban = c(1, 1, 0, 1, 0),  # Whether the house is in an urban area
  type = c(1, 0, 1, 0, 1),   # Type of house: 1 for single-family, 0 for others
  age = c(10, 5, 20, 15, 8), # Age of the house
  weight = c(1.1, 0.9, 1.2, 1.0, 0.8) # I assigned different weights to reflect data confidence
)
# I wanted to analyze house prices while giving more weight to certain observations.
# So, I used analytic weights to account for variations in data precision.
house_lm <- lm(price ~ urban + type + age, data = house_data, weights = weight)
summary(house_lm)
## 
## Call:
## lm(formula = price ~ urban + type + age, data = house_data, weights = weight)
## 
## Weighted Residuals:
##          1          2          3          4          5 
##  8.185e-12  2.222e+04  1.604e+04 -2.108e+04 -1.964e+04 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept)  470185.4    79404.8   5.921    0.107
## urban        -40858.8    50938.9  -0.802    0.570
## type        -123828.9    47647.9  -2.599    0.234
## age            -549.8     3684.0  -0.149    0.906
## 
## Residual standard error: 39760 on 1 degrees of freedom
## Multiple R-squared:  0.8889, Adjusted R-squared:  0.5555 
## F-statistic: 2.666 on 3 and 1 DF,  p-value: 0.4165
# Output explained:
# The intercept and the urban, type, and age coefficients show how each factor is associated with price,
# although with only 1 residual degree of freedom none of them reaches statistical significance.
# The analytic weights give more influence to the observations I trust more, which is reflected in the
# weighted residuals and the coefficient estimates.
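# To gauge how much the weights matter here, one check I might add (a sketch, not part of the original
# output) is to refit the same model without weights and compare the coefficients side by side:
house_lm_unweighted <- lm(price ~ urban + type + age, data = house_data)
cbind(weighted = coef(house_lm), unweighted = coef(house_lm_unweighted))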

# Now, I want to consider sampling weights for a more balanced analysis based on population differences.
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
# I added a unique identifier to each house entry because I want to use this for the survey design.
house_data$ID <- 1:nrow(house_data)
# I created a survey design object using the unique IDs and my weights.
# This setup lets me run a survey-weighted regression for better population-level inference.
survey_design <- svydesign(id = ~ID, weights = ~weight, data = house_data)
# I ran a generalized linear model on the survey design to see how urban, type, and age influence prices.
survey_lm <- svyglm(price ~ urban + type + age, design = survey_design)
summary(survey_lm)
## 
## Call:
## svyglm(formula = price ~ urban + type + age, design = survey_design)
## 
## Survey design:
## svydesign(id = ~ID, weights = ~weight, data = house_data)
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  470185.4    40792.7  11.526   0.0551 .
## urban        -40858.8    18634.6  -2.193   0.2724  
## type        -123828.9    17570.4  -7.048   0.0897 .
## age            -549.8     2044.7  -0.269   0.8328  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 395250488)
## 
## Number of Fisher Scoring iterations: 2
# Output explained:
# The point estimates match the weighted lm() fit, but the standard errors are now design-based,
# computed from the survey design rather than from the usual least-squares formula.
# That adjustment is what makes the inference appropriate for population-level conclusions.
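# If I also wanted design-based confidence intervals, confint() works on svyglm fits
# (sketched here as an extra step; output not shown in the original):
confint(survey_lm)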
# I wanted to explore non-linearity in a dataset about student test scores based on study hours and sleep hours.
# I created a dataset to simulate this scenario.
student_data <- data.frame(
  score = c(55, 65, 75, 85, 95, 70, 60, 80, 90, 100),
  study_hours = c(2, 3, 4, 5, 6, 3, 2.5, 4.5, 5.5, 6.5),
  sleep_hours = c(6, 7, 5, 8, 6, 7, 6.5, 5.5, 7.5, 8)
)
# First, I plotted the variables to visualize any potential relationships.
plot(student_data[, c("score", "study_hours", "sleep_hours")])

# The relationship between score and study hours looked non-linear to me.
# I started with a simple linear model to see how study_hours and sleep_hours affect the score.
fit_linear <- lm(score ~ study_hours + sleep_hours, data = student_data)
summary(fit_linear)
## 
## Call:
## lm(formula = score ~ study_hours + sleep_hours, data = student_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0661 -0.9707 -0.2561  0.0407  3.9339 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  34.5484     3.6618   9.435 3.14e-05 ***
## study_hours   9.6369     0.3843  25.077 4.09e-08 ***
## sleep_hours   0.3724     0.5787   0.644     0.54    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.676 on 7 degrees of freedom
## Multiple R-squared:  0.9905, Adjusted R-squared:  0.9877 
## F-statistic: 363.5 on 2 and 7 DF,  p-value: 8.47e-08
# In the initial fit, study_hours was highly significant, but sleep_hours was not (p = 0.54).
# I decided to fit a quadratic model by adding a squared term for study_hours to check for non-linearity.
fit_quadratic <- lm(score ~ study_hours + sleep_hours + I(study_hours^2), data = student_data)
summary(fit_quadratic)
## 
## Call:
## lm(formula = score ~ study_hours + sleep_hours + I(study_hours^2), 
##     data = student_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2190 -0.8501 -0.3437  0.1417  3.7810 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)   
## (Intercept)       31.5076     7.2908   4.322  0.00497 **
## study_hours       10.9998     2.7971   3.933  0.00769 **
## sleep_hours        0.4541     0.6348   0.715  0.50131   
## I(study_hours^2)  -0.1629     0.3308  -0.493  0.63986   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.775 on 6 degrees of freedom
## Multiple R-squared:  0.9908, Adjusted R-squared:  0.9863 
## F-statistic: 216.2 on 3 and 6 DF,  p-value: 1.679e-06
# Comparing the two fits: R-squared barely moved (0.9905 to 0.9908), the adjusted R-squared actually
# dropped, and the squared term itself is not significant (p = 0.64), so there is little evidence of
# non-linearity in study_hours for this data.
# The fitted quadratic model is approximately:
# score = 31.51 + 11.00 * study_hours + 0.45 * sleep_hours - 0.16 * study_hours^2
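# The same equation can be built programmatically from the coefficients, mirroring what I did for
# the house-price plot (a small sketch I added; "qb" is just a throwaway name):
qb <- round(coef(fit_quadratic), 3)
paste0("score = ", qb[1], " + ", qb[2], "*study_hours + ", qb[3], "*sleep_hours ",
       ifelse(qb[4] < 0, "- ", "+ "), abs(qb[4]), "*study_hours^2")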

# Another way I could specify the polynomial regression is by using the poly function.
# I used poly with raw=TRUE to get the same result but in a slightly different way.
summary(lm(score ~ sleep_hours + poly(study_hours, 2, raw = TRUE), data = student_data))
## 
## Call:
## lm(formula = score ~ sleep_hours + poly(study_hours, 2, raw = TRUE), 
##     data = student_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2190 -0.8501 -0.3437  0.1417  3.7810 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                        31.5076     7.2908   4.322  0.00497 **
## sleep_hours                         0.4541     0.6348   0.715  0.50131   
## poly(study_hours, 2, raw = TRUE)1  10.9998     2.7971   3.933  0.00769 **
## poly(study_hours, 2, raw = TRUE)2  -0.1629     0.3308  -0.493  0.63986   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.775 on 6 degrees of freedom
## Multiple R-squared:  0.9908, Adjusted R-squared:  0.9863 
## F-statistic: 216.2 on 3 and 6 DF,  p-value: 1.679e-06
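# By default poly() uses orthogonal polynomials (raw = FALSE). The individual coefficients then look
# different, but the fitted values are identical - a quick way to confirm both parameterisations
# describe the same model (my own check, not shown in the original output):
fit_orth <- lm(score ~ sleep_hours + poly(study_hours, 2), data = student_data)
all.equal(fitted(fit_orth), fitted(fit_quadratic))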
# I was curious about how to visualize this 3D relationship, so I explored the plot3D package.
# I installed and loaded plot3D to plot the data points and a fitted 3D surface.
#install.packages("plot3D") # Install if you haven't already
library(plot3D)
# Using this approach, I could visualize the relationship between the variables in more detail.

# Dataset 
student_data <- data.frame(
  score = c(55, 65, 75, 85, 95, 70, 60, 80, 90, 100),
  study_hours = c(2, 3, 4, 5, 6, 3, 2.5, 4.5, 5.5, 6.5),
  sleep_hours = c(6, 7, 5, 8, 6, 7, 6.5, 5.5, 7.5, 8)
)

# Fitting a quadratic model
fit_quadratic <- lm(score ~ study_hours + sleep_hours + I(study_hours^2), data = student_data)

# I created sequences for study_hours and sleep_hours
study_seq <- seq(min(student_data$study_hours), max(student_data$study_hours), length.out = 50)
sleep_seq <- seq(min(student_data$sleep_hours), max(student_data$sleep_hours), length.out = 50)

# Then I created a grid from the two sequences
grid <- expand.grid(study_hours = study_seq, sleep_hours = sleep_seq)

# Next I predicted the scores for the grid
grid$score <- predict(fit_quadratic, newdata = grid)

# Then I converted the grid to matrices
study_matrix <- matrix(grid$study_hours, nrow = 50, ncol = 50)
sleep_matrix <- matrix(grid$sleep_hours, nrow = 50, ncol = 50)
score_matrix <- matrix(grid$score, nrow = 50, ncol = 50)

# 3D scatter plot 
scatter3D(
  x = student_data$study_hours, 
  y = student_data$sleep_hours, 
  z = student_data$score, 
  pch = 19, colkey = FALSE, main = "3D Plot: Score ~ Study & Sleep Hours",
  xlab = "Study Hours", ylab = "Sleep Hours", zlab = "Score"
)

# Let's add the fitted surface
surf3D(
  x = study_matrix, 
  y = sleep_matrix, 
  z = score_matrix, 
  add = TRUE, col = "lightblue", alpha = 0.5
)
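# As a side note, the expand.grid/matrix round-trip can be skipped by building the prediction
# matrix directly with outer() - a sketch under the same fitted model, added for illustration:
score_matrix2 <- outer(study_seq, sleep_seq, function(s, h) {
  predict(fit_quadratic, data.frame(study_hours = s, sleep_hours = h))
})
all.equal(score_matrix, score_matrix2)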

# I decided to explore the relationship between student test scores and their study hours.
# First, I created a simple linear model to predict scores based on study hours.
fit <- lm(score ~ study_hours, data = student_data)

# To visualize the relationship, I plotted study hours against scores.
# I used a scatter plot to see how the data points are distributed.
plot(student_data$study_hours, student_data$score, pch = 18, 
     xlab = 'Study Hours', ylab = 'Test Score')

# I wanted to add the regression line to the plot, showing the predicted relationship.
# The line spans from the minimum to the maximum study hours in my dataset.
lines(
  c(min(student_data$study_hours), max(student_data$study_hours)),
  as.numeric(predict(fit, data.frame(study_hours = c(min(student_data$study_hours), max(student_data$study_hours)))))
)

# To make the plot more informative, I decided to add the regression equation, the R-squared value, and the Pearson correlation coefficient.
# I set up an expression vector with vector() to hold these pieces for the legend.

# First, I set up a vector to hold the expressions I wanted to display.
rp = vector('expression', 3)

# The first entry is the regression equation, showing the intercept and slope.
rp[1] = substitute(expression(italic(y) == MYOTHERVALUE3 + MYOTHERVALUE4 %*% x),
                   list(MYOTHERVALUE3 = format(fit$coefficients[1], digits = 2),
                        MYOTHERVALUE4 = format(fit$coefficients[2], digits = 2)))[2]

# Next, I added the adjusted R-squared value, which tells me how well the model fits the data.
rp[2] = substitute(expression(italic(R)^2 == MYVALUE),
                   list(MYVALUE = format(summary(fit)$adj.r.squared, dig = 3)))[2]

# Finally, I included the Pearson correlation coefficient, giving me a measure of linear correlation between study hours and scores.
rp[3] = substitute(expression(Pearson-R == MYOTHERVALUE2),
                   list(MYOTHERVALUE2 = format(cor(student_data$study_hours, student_data$score), digits = 2)))[2]

# I placed the legend on the top right corner of the plot, without a box around it, to keep the plot clean.
legend("topright", legend = rp, bty = 'n')

# This plot now gives a clear visual representation of the relationship between study hours and test scores.
# It includes the regression line, equation, R-squared value, and correlation coefficient.
# After I built my regression model, I wanted to assess its quality to ensure it was a good fit for my data.
# I knew the importance of examining the residuals and other diagnostic plots to validate the assumptions of my model.

# First, I fit the linear model to see how test scores are affected by study hours.
fit <- lm(score ~ study_hours, data = student_data)

# To get a comprehensive view, I decided to create multiple diagnostic plots.
# I set up the plotting area to display two plots, one above the other.
par(mfrow = c(2, 1))

# I then plotted the diagnostic plots for my model to evaluate its performance.
# The first plot checks for linearity in the relationship between study hours and scores.
# The second plot examines if the residuals are normally distributed.
plot(fit, which = 1:2)

# Here's what I was looking for in these plots:
# 1. I wanted to confirm that the relationship between study hours and scores is linear.
#    This means the residuals should be evenly distributed around the horizontal line at zero.
#    If the residuals are not centered, it might suggest a non-linear relationship.
#    I noticed that residuals were positive at the extremes and negative in the middle, hinting at some non-linearity.
#
# 2. I also checked if the residuals followed a normal distribution.
#    For normally distributed residuals, the points in the Q-Q plot should align closely with the diagonal line.
#    I observed some deviation at the ends, indicating slight skewness in the residuals.
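# If I want to go beyond the visual check, one option I could add (a sketch, not part of the
# original workflow) is a formal normality test on the residuals plus the remaining diagnostic plots:
shapiro.test(residuals(fit))  # small-sample caveat: only 10 observations here
par(mfrow = c(2, 1))
plot(fit, which = c(3, 5))    # scale-location and residuals vs leverage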