# Load libraries
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data("midwest")

## Modeling “inmetro” (whether a county is in a metro area) as a binary response variable, with “perchsd” (percent with a high school diploma) as the predictor variable:

model1 <- glm(inmetro ~ perchsd, data = midwest, family = "binomial")
summary(model1)
## 
## Call:
## glm(formula = inmetro ~ perchsd, family = "binomial", data = midwest)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9400  -0.8387  -0.5079   0.9084   2.8821  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -18.12529    2.13413  -8.493   <2e-16 ***
## perchsd       0.23316    0.02821   8.265   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 562.13  on 436  degrees of freedom
## Residual deviance: 456.69  on 435  degrees of freedom
## AIC: 460.69
## 
## Number of Fisher Scoring iterations: 5

In this logistic regression model, perchsd is a highly significant predictor of inmetro (z = 8.265, p < 2e-16): each additional percentage point of high-school graduates increases the log-odds of a county being in a metro area by about 0.233.
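The coefficient is easier to read on the odds scale. A minimal sketch, using the model1 object fitted above:

# Convert the log-odds coefficient for perchsd into an odds ratio
exp(coef(model1)["perchsd"])
# exp(0.23316) is roughly 1.26: each additional percentage point of
# high-school graduates multiplies the odds of being in a metro area by ~1.26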

## Build a logistic regression model for this variable, using between 1 and 4 explanatory variables

model2 <- glm(inmetro ~ perchsd + percollege + percprof, data = midwest, family = "binomial")
summary(model2)
## 
## Call:
## glm(formula = inmetro ~ perchsd + percollege + percprof, family = "binomial", 
##     data = midwest)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4328  -0.7768  -0.5023   0.8788   2.5610  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -11.92435    2.57774  -4.626 3.73e-06 ***
## perchsd       0.12208    0.04022   3.036   0.0024 ** 
## percollege    0.07099    0.05326   1.333   0.1826    
## percprof      0.18546    0.12369   1.499   0.1338    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 562.13  on 436  degrees of freedom
## Residual deviance: 438.76  on 433  degrees of freedom
## AIC: 446.76
## 
## Number of Fisher Scoring iterations: 5
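Adding percollege and percprof lowers the AIC from 460.69 to 446.76, although neither added coefficient is individually significant. As a sketch, a likelihood-ratio test with anova() can check whether the two extra terms jointly improve on model1 (output not shown here):

# Likelihood-ratio (chi-squared) test comparing the nested models
anova(model1, model2, test = "Chisq")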

## Using the standard error for at least one coefficient, build a confidence interval

se <- summary(model1)$coefficients["perchsd", "Std. Error"]
confidence_level <- 0.95 # 95%

# Two-sided normal (Wald) critical value: about 1.96 for 95%
critical_value <- qnorm((1 + confidence_level) / 2)

margin_error <- se * critical_value

lower_bound <- coef(model1)["perchsd"] - margin_error
upper_bound <- coef(model1)["perchsd"] + margin_error

cat("95% Confidence Interval for perchsd:", lower_bound, "to", upper_bound, "\n")
## 95% Confidence Interval for perchsd: 0.1778686 to 0.2884512

## Using the direct method (confint):

#95% confidence intervals for model 1
CI_model1 <- confint(model1)
## Waiting for profiling to be done...
CI_model1
##                   2.5 %      97.5 %
## (Intercept) -22.5255890 -14.1421652
## perchsd       0.1804647   0.2912893

These intervals give a range of values within which we can be 95% confident that the true coefficients lie: the effect of “perchsd” on the log-odds of inmetro is likely between 0.1804647 and 0.2912893. This profile-likelihood interval is close to, but not identical to, the Wald interval computed by hand above. Narrower CIs indicate more precise estimates.
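As with the point estimate, the interval is easier to interpret on the odds scale; a minimal sketch using the CI_model1 object above:

# Exponentiate the profile-likelihood CI to get an odds-ratio interval
exp(CI_model1["perchsd", ])
# Roughly 1.20 to 1.34: the odds of being in a metro area rise by about
# 20% to 34% per additional percentage point of high-school graduates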

#95% confidence intervals for model 2
CI_model2 <- confint(model2)
## Waiting for profiling to be done...
CI_model2
##                    2.5 %     97.5 %
## (Intercept) -17.20139733 -7.0838504
## perchsd       0.04562428  0.2035174
## percollege   -0.03299116  0.1763548
## percprof     -0.05196175  0.4336028
plot(model1)

The simplest way to detect heteroscedasticity is a fitted-values vs. residuals plot. If the residuals become much more spread out as the fitted values get larger, forming a “cone” shape, that is a sign of heteroscedasticity; no such pattern is present here.
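The same kind of diagnostic can also be drawn by hand; a minimal sketch using deviance residuals from model1:

# Plot deviance residuals against fitted values to look for a "cone" shape
plot(fitted(model1), residuals(model1, type = "deviance"),
     xlab = "Fitted values", ylab = "Deviance residuals",
     main = "Residuals vs. Fitted")
abline(h = 0, lty = 2)  # reference line at zero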

ggplot(midwest, aes(x = perchsd, y = inmetro)) +
  geom_point() +                   
  geom_smooth(method = "loess") +  
  xlab("perchsd") +                
  ylab("inmetro") +                
  ggtitle("Scatter Plot")  
## `geom_smooth()` using formula = 'y ~ x'
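Because inmetro only takes the values 0 and 1, the loess smoother is just a rough guide here. As a sketch, geom_smooth() can instead overlay the fitted logistic curve:

ggplot(midwest, aes(x = perchsd, y = inmetro)) +
  geom_point() +
  # fit and draw a logistic (binomial glm) curve instead of loess
  geom_smooth(method = "glm", method.args = list(family = "binomial")) +
  xlab("perchsd") +
  ylab("inmetro") +
  ggtitle("Scatter Plot with Logistic Fit")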

In logistic regression, variable transformations are applied to improve the fit of the model to the data. Common transformation functions include the natural log, square, square root, exponential, scaling (standardization and normalization), and binning/bucketing, as sketched below.
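As a sketch, each of these could be applied to a predictor such as poptotal before refitting (the new column names here are purely illustrative):

# Illustrative transformed versions of poptotal (column names are made up)
midwest$log_pop    <- log(midwest$poptotal)                 # natural log
midwest$sq_pop     <- midwest$poptotal^2                    # square
midwest$sqrt_pop   <- sqrt(midwest$poptotal)                # square root
midwest$scaled_pop <- as.numeric(scale(midwest$poptotal))   # standardization
midwest$pop_bin    <- cut(midwest$poptotal, breaks = 4)     # binning/bucketing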

Transformations are worth considering when a predictor is heavily skewed or the relationship with the response is clearly nonlinear. The model1 fit looks adequate and does not seem to need a transformation to improve it; a quick deviance-based check is sketched below.
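McFadden's pseudo-R-squared gives one rough numerical summary of that fit, computed from the deviances reported in the summary above:

# McFadden's pseudo-R-squared: 1 - residual deviance / null deviance
1 - model1$deviance / model1$null.deviance
# approximately 1 - 456.69/562.13, i.e. about 0.19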

I can still check a transformation for the model below by simply taking the log of the dependent variable, e.g. using population size (poptotal) to predict whether a county is in a metro area (inmetro, the dependent variable).

midwest$inmetro_transformed <- log(midwest$inmetro + 1)  # Adding 1 to avoid log(0)

model_transformed <- lm(inmetro_transformed ~ poptotal, data = midwest)
summary(model_transformed)
## 
## Call:
## lm(formula = inmetro_transformed ~ poptotal, data = midwest)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2516 -0.2159 -0.2104  0.4272  0.4862 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.052e-01  1.577e-02  13.008  < 2e-16 ***
## poptotal    3.407e-07  5.040e-08   6.761 4.43e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3138 on 435 degrees of freedom
## Multiple R-squared:  0.09509,    Adjusted R-squared:  0.09301 
## F-statistic: 45.71 on 1 and 435 DF,  p-value: 4.425e-11

This linear regression model with a transformed response variable suggests a statistically significant relationship between log(inmetro + 1) and poptotal.

model_reg <- lm(inmetro ~ poptotal, data = midwest)
summary(model_reg)
## 
## Call:
## lm(formula = inmetro ~ poptotal, data = midwest)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8056 -0.3115 -0.3035  0.6163  0.7014 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.960e-01  2.276e-02  13.008  < 2e-16 ***
## poptotal    4.916e-07  7.271e-08   6.761 4.43e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4527 on 435 degrees of freedom
## Multiple R-squared:  0.09509,    Adjusted R-squared:  0.09301 
## F-statistic: 45.71 on 1 and 435 DF,  p-value: 4.425e-11

This linear regression model suggests a statistically significant relationship between inmetro and poptotal.

Both models have exactly the same R-squared and adjusted R-squared values, so they explain the same proportion of the variance in the response. This is expected rather than a coincidence: because inmetro is binary, the log transformation only rescales the response, so it changes the scale of the coefficients but not the quality of the fit.
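The rescaling is easy to verify: for a 0/1 variable x, log(x + 1) equals log(2) * x exactly.

# For binary inmetro, log(inmetro + 1) is just log(2) * inmetro
all.equal(midwest$inmetro_transformed, log(2) * midwest$inmetro)
# TRUE: the two responses differ only by a constant scale factor,
# so both regressions give identical R-squared values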