library(faraway)
data <- (teengamb)
head(data)
## sex status income verbal gamble
## 1 1 51 2.00 8 0.0
## 2 1 28 2.50 8 0.0
## 3 1 37 2.00 6 0.0
## 4 1 28 7.00 4 7.3
## 5 1 65 2.00 8 19.6
## 6 1 61 3.47 6 0.1
(a) What percentage of variation in the response is explained by these predictors?
# Assuming 'teengamb' is your dataset
# Fit a regression model
model <- lm(gamble ~ sex + status + income + verbal, data = teengamb)
# Print the summary of the model
summary(model)
##
## Call:
## lm(formula = gamble ~ sex + status + income + verbal, data = teengamb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.082 -11.320 -1.451 9.452 94.252
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.55565 17.19680 1.312 0.1968
## sex -22.11833 8.21111 -2.694 0.0101 *
## status 0.05223 0.28111 0.186 0.8535
## income 4.96198 1.02539 4.839 1.79e-05 ***
## verbal -2.95949 2.17215 -1.362 0.1803
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.69 on 42 degrees of freedom
## Multiple R-squared: 0.5267, Adjusted R-squared: 0.4816
## F-statistic: 11.69 on 4 and 42 DF, p-value: 1.815e-06
# Extract and print R-squared
r_squared <- summary(model)$r.squared
cat("Percentage of variation explained by predictors:", round(r_squared * 100, 2), "%\n")
## Percentage of variation explained by predictors: 52.67 %
(b) Which observation has the largest (positive) residual? Give the case number.
# Extract residuals
residuals <- residuals(model)
# Find the index of the observation with the largest positive residual
max_residual_index <- which.max(residuals)
# Print the case number
cat("Observation with the largest positive residual (case number):", max_residual_index, "\n")
## Observation with the largest positive residual (case number): 24
(c) Compute the mean and median of the residuals.
# Compute mean and median of residuals
mean_residual <- mean(residuals)
median_residual <- median(residuals)
# Print the results
cat("Mean of residuals:", mean_residual, "\n")
## Mean of residuals: -3.065293e-17
cat("Median of residuals:", median_residual, "\n")
## Median of residuals: -1.451392
(d) Compute the correlation of the residuals with the fitted values.
# Extract residuals and fitted values
residuals <- residuals(model)
fitted_values <- fitted(model)
# Compute the correlation
correlation_resid_fitted <- cor(residuals, fitted_values)
# Print the result
cat("Correlation of residuals with fitted values:", correlation_resid_fitted, "\n")
## Correlation of residuals with fitted values: -1.070659e-16
(e) Compute the correlation of the residuals with the income.
# Extract residuals and income
residuals <- residuals(model)
income <- teengamb$income
# Compute the correlation
correlation_resid_income <- cor(residuals, income)
# Print the result
cat("Correlation of residuals with income:", correlation_resid_income, "\n")
## Correlation of residuals with income: -7.242382e-17
(f) For all other predictors held constant, what would be the difference in predicted expenditure on gambling for a male compared to a female?
# Extract coefficients
coefficients <- coef(model)
# Find the coefficient for 'sex'
coefficient_sex <- coefficients['sex']
# Print the difference in predicted expenditure for male compared to female
cat("Difference in predicted expenditure for male compared to female:", coefficient_sex, "\n")
## Difference in predicted expenditure for male compared to female: -22.11833
uswages_1 <-(uswages)
head(uswages)
## wage educ exper race smsa ne mw so we pt
## 6085 771.60 18 18 0 1 1 0 0 0 0
## 23701 617.28 15 20 0 1 0 0 0 1 0
## 16208 957.83 16 9 0 1 0 0 1 0 0
## 2720 617.28 12 24 0 1 1 0 0 0 0
## 9723 902.18 14 12 0 1 0 1 0 0 0
## 22239 299.15 12 33 0 1 0 0 0 1 0
# Fit a regression model with weekly wages as the response
model_wages <- lm(wage ~ educ + exper, data = uswages_1)
# Print the summary of the model
summary(model_wages)
##
## Call:
## lm(formula = wage ~ educ + exper, data = uswages_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1018.2 -237.9 -50.9 149.9 7228.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -242.7994 50.6816 -4.791 1.78e-06 ***
## educ 51.1753 3.3419 15.313 < 2e-16 ***
## exper 9.7748 0.7506 13.023 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 427.9 on 1997 degrees of freedom
## Multiple R-squared: 0.1351, Adjusted R-squared: 0.1343
## F-statistic: 156 on 2 and 1997 DF, p-value: < 2.2e-16
we’ll fit the same model but with logged weekly wages and interpret the regression coefficient for years of education.
# Fit a regression model with logged weekly wages as the response
model_logged_wages <- lm(log(wage) ~ educ + exper, data = uswages_1)
# Print the summary of the model
summary(model_logged_wages)
##
## Call:
## lm(formula = log(wage) ~ educ + exper, data = uswages_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7533 -0.3495 0.1068 0.4381 3.5699
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.650319 0.078354 59.35 <2e-16 ***
## educ 0.090506 0.005167 17.52 <2e-16 ***
## exper 0.018079 0.001160 15.58 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6615 on 1997 degrees of freedom
## Multiple R-squared: 0.1749, Adjusted R-squared: 0.174
## F-statistic: 211.6 on 2 and 1997 DF, p-value: < 2.2e-16
# Interpretation of the coefficient for 'educ' in the logged model
coefficient_logged = coef(model_logged_wages)["educ"]
percentage_change = (exp(coefficient_logged) - 1) * 100
cat("Percentage change in weekly wages for a one-unit increase in years of education:", percentage_change, "%\n")
## Percentage change in weekly wages for a one-unit increase in years of education: 9.472838 %
1. The dataset wbca comes from a study of breast cancer in Wisconsin. There are 681 cases of potentially cancerous tumors of which 238 are actually malignant. Determining whether a tumor is really malignant is traditionally determined by an invasive surgical procedure. The purpose of this study was to determine whether a new procedure called fine needle aspiration, which draws only a small sample of tissue, could be effective in determining tumor status.
plot(Class ~ BNucl, wbca)
# ii. Create a factor version of the response and produce boxplots
wbca$ClassFactor <- factor(wbca$Class, levels = c(0, 1), labels = c("Benign", "Malignant"))
boxplot(BNucl ~ ClassFactor, data = wbca, xlab = "Class", ylab = "BNucl", main = "Boxplot of BNucl by Class")
# iv. Produce a version of the interleaved histogram
par(mfrow = c(1, 2))
hist(wbca$BNucl[wbca$ClassFactor == "Benign"], col = "lightblue", main = "Benign Tumors", xlab = "BNucl")
hist(wbca$BNucl[wbca$ClassFactor == "Malignant"], col = "salmon", main = "Malignant Tumors", xlab = "BNucl")
# c. Fit a binary regression with Class as the response and the other nine variables as predictors
binary_model <- glm(Class ~ Adhes + BNucl + Chrom + Epith + Mitos + NNucl + Thick + UShap + USize, data = wbca, family = binomial)
summary(binary_model)
##
## Call:
## glm(formula = Class ~ Adhes + BNucl + Chrom + Epith + Mitos +
## NNucl + Thick + UShap + USize, family = binomial, data = wbca)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.48282 -0.01179 0.04739 0.09678 3.06425
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 11.16678 1.41491 7.892 2.97e-15 ***
## Adhes -0.39681 0.13384 -2.965 0.00303 **
## BNucl -0.41478 0.10230 -4.055 5.02e-05 ***
## Chrom -0.56456 0.18728 -3.014 0.00257 **
## Epith -0.06440 0.16595 -0.388 0.69795
## Mitos -0.65713 0.36764 -1.787 0.07387 .
## NNucl -0.28659 0.12620 -2.271 0.02315 *
## Thick -0.62675 0.15890 -3.944 8.01e-05 ***
## UShap -0.28011 0.25235 -1.110 0.26699
## USize 0.05718 0.23271 0.246 0.80589
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 881.388 on 680 degrees of freedom
## Residual deviance: 89.464 on 671 degrees of freedom
## AIC: 109.46
##
## Number of Fisher Scoring iterations: 8
# d. Use AIC for variable selection
selected_model <- step(binary_model, direction = "both", trace = FALSE)
summary(selected_model)
##
## Call:
## glm(formula = Class ~ Adhes + BNucl + Chrom + Mitos + NNucl +
## Thick + UShap, family = binomial, data = wbca)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.44161 -0.01119 0.04962 0.09741 3.08205
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 11.0333 1.3632 8.094 5.79e-16 ***
## Adhes -0.3984 0.1294 -3.080 0.00207 **
## BNucl -0.4192 0.1020 -4.111 3.93e-05 ***
## Chrom -0.5679 0.1840 -3.085 0.00203 **
## Mitos -0.6456 0.3634 -1.777 0.07561 .
## NNucl -0.2915 0.1236 -2.358 0.01837 *
## Thick -0.6216 0.1579 -3.937 8.27e-05 ***
## UShap -0.2541 0.1785 -1.423 0.15461
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 881.388 on 680 degrees of freedom
## Residual deviance: 89.662 on 673 degrees of freedom
## AIC: 105.66
##
## Number of Fisher Scoring iterations: 8
# e. Compute number of errors with the reduced model
predicted_probs <- predict(selected_model, type = "response")
predicted_class <- ifelse(predicted_probs > 0.5, 1, 0)
conf_matrix <- table(predicted_class, wbca$Class)
errors <- sum(conf_matrix[c(2, 3)]) # Sum of false positives and false negatives
cat("Number of errors:", errors, "\n")
## Number of errors: 20