```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
```

Load necessary libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(broom)

Load the data

# Import the Ames housing data from CSV and display its structure
# NOTE(review): the path is absolute and machine-specific; the file must exist
# at exactly this location for the document to knit
ames <- read.csv(file = "D:/Stats for DS/ames.csv", header = TRUE)
str(ames)
## 'data.frame':    2930 obs. of  82 variables:
##  $ Order          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ PID            : int  526301100 526350040 526351010 526353030 527105010 527105030 527127150 527145080 527146030 527162130 ...
##  $ MS.SubClass    : int  20 20 20 20 60 60 120 120 120 60 ...
##  $ MS.Zoning      : chr  "RL" "RH" "RL" "RL" ...
##  $ Lot.Frontage   : int  141 80 81 93 74 78 41 43 39 60 ...
##  $ Lot.Area       : int  31770 11622 14267 11160 13830 9978 4920 5005 5389 7500 ...
##  $ Street         : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley          : chr  NA NA NA NA ...
##  $ Lot.Shape      : chr  "IR1" "Reg" "IR1" "Reg" ...
##  $ Land.Contour   : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities      : chr  "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ Lot.Config     : chr  "Corner" "Inside" "Corner" "Corner" ...
##  $ Land.Slope     : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood   : chr  "NAmes" "NAmes" "NAmes" "NAmes" ...
##  $ Condition.1    : chr  "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition.2    : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ Bldg.Type      : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ House.Style    : chr  "1Story" "1Story" "1Story" "1Story" ...
##  $ Overall.Qual   : int  6 5 6 7 5 6 8 8 8 7 ...
##  $ Overall.Cond   : int  5 6 6 5 5 6 5 5 5 5 ...
##  $ Year.Built     : int  1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
##  $ Year.Remod.Add : int  1960 1961 1958 1968 1998 1998 2001 1992 1996 1999 ...
##  $ Roof.Style     : chr  "Hip" "Gable" "Hip" "Hip" ...
##  $ Roof.Matl      : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior.1st   : chr  "BrkFace" "VinylSd" "Wd Sdng" "BrkFace" ...
##  $ Exterior.2nd   : chr  "Plywood" "VinylSd" "Wd Sdng" "BrkFace" ...
##  $ Mas.Vnr.Type   : chr  "Stone" "None" "BrkFace" "None" ...
##  $ Mas.Vnr.Area   : int  112 0 108 0 0 20 0 0 0 0 ...
##  $ Exter.Qual     : chr  "TA" "TA" "TA" "Gd" ...
##  $ Exter.Cond     : chr  "TA" "TA" "TA" "TA" ...
##  $ Foundation     : chr  "CBlock" "CBlock" "CBlock" "CBlock" ...
##  $ Bsmt.Qual      : chr  "TA" "TA" "TA" "TA" ...
##  $ Bsmt.Cond      : chr  "Gd" "TA" "TA" "TA" ...
##  $ Bsmt.Exposure  : chr  "Gd" "No" "No" "No" ...
##  $ BsmtFin.Type.1 : chr  "BLQ" "Rec" "ALQ" "ALQ" ...
##  $ BsmtFin.SF.1   : int  639 468 923 1065 791 602 616 263 1180 0 ...
##  $ BsmtFin.Type.2 : chr  "Unf" "LwQ" "Unf" "Unf" ...
##  $ BsmtFin.SF.2   : int  0 144 0 0 0 0 0 0 0 0 ...
##  $ Bsmt.Unf.SF    : int  441 270 406 1045 137 324 722 1017 415 994 ...
##  $ Total.Bsmt.SF  : int  1080 882 1329 2110 928 926 1338 1280 1595 994 ...
##  $ Heating        : chr  "GasA" "GasA" "GasA" "GasA" ...
##  $ Heating.QC     : chr  "Fa" "TA" "TA" "Ex" ...
##  $ Central.Air    : chr  "Y" "Y" "Y" "Y" ...
##  $ Electrical     : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ X1st.Flr.SF    : int  1656 896 1329 2110 928 926 1338 1280 1616 1028 ...
##  $ X2nd.Flr.SF    : int  0 0 0 0 701 678 0 0 0 776 ...
##  $ Low.Qual.Fin.SF: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Gr.Liv.Area    : int  1656 896 1329 2110 1629 1604 1338 1280 1616 1804 ...
##  $ Bsmt.Full.Bath : int  1 0 0 1 0 0 1 0 1 0 ...
##  $ Bsmt.Half.Bath : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Full.Bath      : int  1 1 1 2 2 2 2 2 2 2 ...
##  $ Half.Bath      : int  0 0 1 1 1 1 0 0 0 1 ...
##  $ Bedroom.AbvGr  : int  3 2 3 3 3 3 2 2 2 3 ...
##  $ Kitchen.AbvGr  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Kitchen.Qual   : chr  "TA" "TA" "Gd" "Ex" ...
##  $ TotRms.AbvGrd  : int  7 5 6 8 6 7 6 5 5 7 ...
##  $ Functional     : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces     : int  2 0 0 2 1 1 0 0 1 1 ...
##  $ Fireplace.Qu   : chr  "Gd" NA NA "TA" ...
##  $ Garage.Type    : chr  "Attchd" "Attchd" "Attchd" "Attchd" ...
##  $ Garage.Yr.Blt  : int  1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
##  $ Garage.Finish  : chr  "Fin" "Unf" "Unf" "Fin" ...
##  $ Garage.Cars    : int  2 1 1 2 2 2 2 2 2 2 ...
##  $ Garage.Area    : int  528 730 312 522 482 470 582 506 608 442 ...
##  $ Garage.Qual    : chr  "TA" "TA" "TA" "TA" ...
##  $ Garage.Cond    : chr  "TA" "TA" "TA" "TA" ...
##  $ Paved.Drive    : chr  "P" "Y" "Y" "Y" ...
##  $ Wood.Deck.SF   : int  210 140 393 0 212 360 0 0 237 140 ...
##  $ Open.Porch.SF  : int  62 0 36 0 34 36 0 82 152 60 ...
##  $ Enclosed.Porch : int  0 0 0 0 0 0 170 0 0 0 ...
##  $ X3Ssn.Porch    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Screen.Porch   : int  0 120 0 0 0 0 0 144 0 0 ...
##  $ Pool.Area      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Pool.QC        : chr  NA NA NA NA ...
##  $ Fence          : chr  NA "MnPrv" NA NA ...
##  $ Misc.Feature   : chr  NA NA "Gar2" NA ...
##  $ Misc.Val       : int  0 0 12500 0 0 0 0 0 0 0 ...
##  $ Mo.Sold        : int  5 6 6 4 3 6 4 1 3 6 ...
##  $ Yr.Sold        : int  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ Sale.Type      : chr  "WD " "WD " "WD " "WD " ...
##  $ Sale.Condition : chr  "Normal" "Normal" "Normal" "Normal" ...
##  $ SalePrice      : int  215000 105000 172000 244000 189900 195500 213500 191500 236500 189000 ...

Converting ‘Central.Air’ to binary (1 for “Y”, 0 for “N”)

# Recode Central.Air in place: "Y" becomes 1, any other non-missing value
# becomes 0. Run this only once per load: after the recode the column is
# numeric, so a second pass would compare numbers with "Y" and yield all 0s.
ames <- mutate(
  ames,
  Central.Air = ifelse(test = Central.Air == "Y", yes = 1, no = 0)
)

The Central.Air variable is converted into a binary format with 1 representing houses with central air conditioning (Y) and 0 representing houses without it (N).

# Build the modelling data set: the response (Central.Air) plus the four
# explanatory variables Gr.Liv.Area, Overall.Qual, Year.Built and Lot.Area.
# Rows with a missing value in any of these columns are removed.
ames_model_data <- na.omit(
  select(ames, Central.Air, Gr.Liv.Area, Overall.Qual, Year.Built, Lot.Area)
)

Fitting Logistic Regression Model

# Logistic regression of Central.Air on living area, overall quality,
# construction year and lot area (binomial family, logit link)
logit_model <- glm(
  formula = Central.Air ~ Gr.Liv.Area + Overall.Qual + Year.Built + Lot.Area,
  family = binomial(link = "logit"),
  data = ames_model_data
)
# Print the coefficient table, deviances and AIC of the fitted model
summary(logit_model)
## 
## Call:
## glm(formula = Central.Air ~ Gr.Liv.Area + Overall.Qual + Year.Built + 
##     Lot.Area, family = binomial(link = "logit"), data = ames_model_data)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -9.668e+01  7.345e+00 -13.162  < 2e-16 ***
## Gr.Liv.Area   2.246e-04  2.333e-04   0.963   0.3357    
## Overall.Qual  7.117e-01  8.922e-02   7.976 1.51e-15 ***
## Year.Built    4.870e-02  3.779e-03  12.888  < 2e-16 ***
## Lot.Area      3.727e-05  2.195e-05   1.698   0.0894 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1438.81  on 2929  degrees of freedom
## Residual deviance:  943.58  on 2925  degrees of freedom
## AIC: 953.58
## 
## Number of Fisher Scoring iterations: 7
# Exponentiate the fitted coefficients so they read as odds ratios instead of
# log-odds
exp(coefficients(logit_model))
##  (Intercept)  Gr.Liv.Area Overall.Qual   Year.Built     Lot.Area 
## 1.029973e-42 1.000225e+00 2.037369e+00 1.049906e+00 1.000037e+00

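The odds ratio for Overall.Qual (2.037) indicates that, holding the other predictors constant, each one-unit increase in overall quality more than doubles the odds of having central air conditioning. Year.Built has an odds ratio of 1.050, so each additional year in the construction date raises the odds by about 5%. In contrast, the odds ratios for Gr.Liv.Area (1.0002) and Lot.Area (1.000037) are very close to 1. These are per-square-foot effects, so their small size partly reflects the unit of measurement; however, neither coefficient is statistically significant at the 5% level (p = 0.336 and p = 0.089), so the model provides little evidence that either variable is associated with the presence of central air.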

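For comparison, an odds ratio of 1.3 would mean that each one-unit increase raises the odds by 30%; the estimated odds ratio of 2.037 for Overall.Qual therefore corresponds to an increase of about 104% in the odds of having central air for each one-unit increase in overall quality.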

# Wald 95% confidence interval for the Gr.Liv.Area coefficient, reported on
# the odds-ratio scale
# Extract the coefficient and its standard error
coeff_se <- summary(logit_model)$coefficients["Gr.Liv.Area", "Std. Error"]
coeff <- coef(logit_model)["Gr.Liv.Area"]

# Calculate 95% CI for Gr.Liv.Area coefficient (estimate +/- z * SE)
z_value <- 1.96 # Approximate z-value for 95% CI, i.e. qnorm(0.975) rounded
ci_lower <- coeff - z_value * coeff_se
ci_upper <- coeff + z_value * coeff_se
# Label the bounds explicitly; without unname() both elements would inherit
# the name "Gr.Liv.Area" and the printed interval would not say which bound
# is which
ci <- c(lower = unname(ci_lower), upper = unname(ci_upper))
# Convert to odds ratio for interpretability (exp() keeps the names)
ci_odds <- exp(ci)
# Output results
cat("Logistic Regression Model Coefficients (Odds Ratios):\n")
## Logistic Regression Model Coefficients (Odds Ratios):
print(exp(coef(logit_model)))
##  (Intercept)  Gr.Liv.Area Overall.Qual   Year.Built     Lot.Area 
## 1.029973e-42 1.000225e+00 2.037369e+00 1.049906e+00 1.000037e+00
cat("\n95% Confidence Interval for Gr.Liv.Area (Odds Ratio):\n")
## 
## 95% Confidence Interval for Gr.Liv.Area (Odds Ratio):
print(ci_odds)
##     lower     upper 
## 0.9997674 1.0006820

The output presents the odds ratios for the logistic regression model. The odds of having central air increase significantly with overall quality (odds ratio of 2.037), while the per-square-foot odds ratios for living area and lot size are close to 1. The 95% confidence interval for the Gr.Liv.Area odds ratio (0.9998 to 1.0007) includes 1, so the association between living area and the odds of having central air is not statistically significant at the 5% level.