```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
```
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(broom)
# Load the Ames housing data from CSV (first row holds the column names)
# and print a compact overview of every column's type and first values.
# NOTE(review): the path is absolute, so this only runs where that folder exists.
ames <- read.csv(file = "D:/Stats for DS/ames.csv", header = TRUE)
str(ames)
## 'data.frame': 2930 obs. of 82 variables:
## $ Order : int 1 2 3 4 5 6 7 8 9 10 ...
## $ PID : int 526301100 526350040 526351010 526353030 527105010 527105030 527127150 527145080 527146030 527162130 ...
## $ MS.SubClass : int 20 20 20 20 60 60 120 120 120 60 ...
## $ MS.Zoning : chr "RL" "RH" "RL" "RL" ...
## $ Lot.Frontage : int 141 80 81 93 74 78 41 43 39 60 ...
## $ Lot.Area : int 31770 11622 14267 11160 13830 9978 4920 5005 5389 7500 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr NA NA NA NA ...
## $ Lot.Shape : chr "IR1" "Reg" "IR1" "Reg" ...
## $ Land.Contour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ Lot.Config : chr "Corner" "Inside" "Corner" "Corner" ...
## $ Land.Slope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr "NAmes" "NAmes" "NAmes" "NAmes" ...
## $ Condition.1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition.2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ Bldg.Type : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ House.Style : chr "1Story" "1Story" "1Story" "1Story" ...
## $ Overall.Qual : int 6 5 6 7 5 6 8 8 8 7 ...
## $ Overall.Cond : int 5 6 6 5 5 6 5 5 5 5 ...
## $ Year.Built : int 1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
## $ Year.Remod.Add : int 1960 1961 1958 1968 1998 1998 2001 1992 1996 1999 ...
## $ Roof.Style : chr "Hip" "Gable" "Hip" "Hip" ...
## $ Roof.Matl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior.1st : chr "BrkFace" "VinylSd" "Wd Sdng" "BrkFace" ...
## $ Exterior.2nd : chr "Plywood" "VinylSd" "Wd Sdng" "BrkFace" ...
## $ Mas.Vnr.Type : chr "Stone" "None" "BrkFace" "None" ...
## $ Mas.Vnr.Area : int 112 0 108 0 0 20 0 0 0 0 ...
## $ Exter.Qual : chr "TA" "TA" "TA" "Gd" ...
## $ Exter.Cond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "CBlock" "CBlock" "CBlock" "CBlock" ...
## $ Bsmt.Qual : chr "TA" "TA" "TA" "TA" ...
## $ Bsmt.Cond : chr "Gd" "TA" "TA" "TA" ...
## $ Bsmt.Exposure : chr "Gd" "No" "No" "No" ...
## $ BsmtFin.Type.1 : chr "BLQ" "Rec" "ALQ" "ALQ" ...
## $ BsmtFin.SF.1 : int 639 468 923 1065 791 602 616 263 1180 0 ...
## $ BsmtFin.Type.2 : chr "Unf" "LwQ" "Unf" "Unf" ...
## $ BsmtFin.SF.2 : int 0 144 0 0 0 0 0 0 0 0 ...
## $ Bsmt.Unf.SF : int 441 270 406 1045 137 324 722 1017 415 994 ...
## $ Total.Bsmt.SF : int 1080 882 1329 2110 928 926 1338 1280 1595 994 ...
## $ Heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ Heating.QC : chr "Fa" "TA" "TA" "Ex" ...
## $ Central.Air : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1st.Flr.SF : int 1656 896 1329 2110 928 926 1338 1280 1616 1028 ...
## $ X2nd.Flr.SF : int 0 0 0 0 701 678 0 0 0 776 ...
## $ Low.Qual.Fin.SF: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Gr.Liv.Area : int 1656 896 1329 2110 1629 1604 1338 1280 1616 1804 ...
## $ Bsmt.Full.Bath : int 1 0 0 1 0 0 1 0 1 0 ...
## $ Bsmt.Half.Bath : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Full.Bath : int 1 1 1 2 2 2 2 2 2 2 ...
## $ Half.Bath : int 0 0 1 1 1 1 0 0 0 1 ...
## $ Bedroom.AbvGr : int 3 2 3 3 3 3 2 2 2 3 ...
## $ Kitchen.AbvGr : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Kitchen.Qual : chr "TA" "TA" "Gd" "Ex" ...
## $ TotRms.AbvGrd : int 7 5 6 8 6 7 6 5 5 7 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : int 2 0 0 2 1 1 0 0 1 1 ...
## $ Fireplace.Qu : chr "Gd" NA NA "TA" ...
## $ Garage.Type : chr "Attchd" "Attchd" "Attchd" "Attchd" ...
## $ Garage.Yr.Blt : int 1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
## $ Garage.Finish : chr "Fin" "Unf" "Unf" "Fin" ...
## $ Garage.Cars : int 2 1 1 2 2 2 2 2 2 2 ...
## $ Garage.Area : int 528 730 312 522 482 470 582 506 608 442 ...
## $ Garage.Qual : chr "TA" "TA" "TA" "TA" ...
## $ Garage.Cond : chr "TA" "TA" "TA" "TA" ...
## $ Paved.Drive : chr "P" "Y" "Y" "Y" ...
## $ Wood.Deck.SF : int 210 140 393 0 212 360 0 0 237 140 ...
## $ Open.Porch.SF : int 62 0 36 0 34 36 0 82 152 60 ...
## $ Enclosed.Porch : int 0 0 0 0 0 0 170 0 0 0 ...
## $ X3Ssn.Porch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Screen.Porch : int 0 120 0 0 0 0 0 144 0 0 ...
## $ Pool.Area : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Pool.QC : chr NA NA NA NA ...
## $ Fence : chr NA "MnPrv" NA NA ...
## $ Misc.Feature : chr NA NA "Gar2" NA ...
## $ Misc.Val : int 0 0 12500 0 0 0 0 0 0 0 ...
## $ Mo.Sold : int 5 6 6 4 3 6 4 1 3 6 ...
## $ Yr.Sold : int 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ Sale.Type : chr "WD " "WD " "WD " "WD " ...
## $ Sale.Condition : chr "Normal" "Normal" "Normal" "Normal" ...
## $ SalePrice : int 215000 105000 172000 244000 189900 195500 213500 191500 236500 189000 ...
# Recode Central.Air as a numeric 0/1 indicator: 1 where the value is "Y",
# 0 for any other value; missing values remain NA.
ames <- mutate(ames, Central.Air = as.numeric(Central.Air == "Y"))
The Central.Air variable is converted into a binary format with 1 representing houses with central air conditioning (Y) and 0 representing houses without it (N).
# Keep only the modelling columns: the response (Central.Air) and the four
# explanatory variables Gr.Liv.Area, Overall.Qual, Year.Built and Lot.Area,
# then drop every row that has a missing value in any of them.
ames_model_data <- na.omit(
  select(ames, Central.Air, Gr.Liv.Area, Overall.Qual, Year.Built, Lot.Area)
)
We select only the necessary columns for modeling: Central.Air (response variable), and Gr.Liv.Area, Overall.Qual, Year.Built, and Lot.Area as explanatory variables.
The na.omit() function removes any rows with missing values, ensuring a clean dataset for analysis.
# Fit the logistic regression model: a binomial GLM with a logit link that
# regresses the 0/1 Central.Air indicator on Gr.Liv.Area, Overall.Qual,
# Year.Built and Lot.Area, using the NA-free ames_model_data.
logit_model <- glm(Central.Air ~ Gr.Liv.Area + Overall.Qual + Year.Built + Lot.Area,
data = ames_model_data, family = binomial(link = "logit"))
We fit a logistic regression model with Central.Air as the binary response variable.
glm() is used to fit the model, specifying `family = binomial(link = "logit")` so that the logit link function is used, which is what makes this a logistic regression.
The model estimates the probability of a house having central air conditioning based on the explanatory variables (Gr.Liv.Area, Overall.Qual, Year.Built, and Lot.Area).
# Summary of the logistic regression model: prints the call, the coefficient
# table (estimate, standard error, z value, p-value), the null and residual
# deviances, and the AIC.
summary(logit_model)
##
## Call:
## glm(formula = Central.Air ~ Gr.Liv.Area + Overall.Qual + Year.Built +
## Lot.Area, family = binomial(link = "logit"), data = ames_model_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.668e+01 7.345e+00 -13.162 < 2e-16 ***
## Gr.Liv.Area 2.246e-04 2.333e-04 0.963 0.3357
## Overall.Qual 7.117e-01 8.922e-02 7.976 1.51e-15 ***
## Year.Built 4.870e-02 3.779e-03 12.888 < 2e-16 ***
## Lot.Area 3.727e-05 2.195e-05 1.698 0.0894 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1438.81 on 2929 degrees of freedom
## Residual deviance: 943.58 on 2925 degrees of freedom
## AIC: 953.58
##
## Number of Fisher Scoring iterations: 7
Overall.Qual and Year.Built are the most significant predictors (p < 0.001), indicating that higher quality and newer houses are more likely to have central air conditioning.
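Gr.Liv.Area (p = 0.336) and Lot.Area (p = 0.089) have much higher p-values; neither is statistically significant at the 5% level, so this model provides little evidence that they are associated with central air presence once overall quality and year built are accounted for.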
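The drop in deviance (from 1438.81 to 943.58, a reduction of 495.23 on 4 degrees of freedom) shows that the model fits substantially better than the intercept-only model. The AIC of 953.58 is not interpretable on its own; it is useful for comparing this model against alternative models fitted to the same data, where a lower AIC indicates the preferred model.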
# Interpret coefficients: the model coefficients are on the log-odds scale,
# so exponentiate them to obtain odds ratios for easier interpretation
exp(coef(logit_model))
## (Intercept) Gr.Liv.Area Overall.Qual Year.Built Lot.Area
## 1.029973e-42 1.000225e+00 2.037369e+00 1.049906e+00 1.000037e+00
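The odds ratio for Overall.Qual (2.037) indicates that for each one-unit increase in overall quality, the odds of having central air conditioning more than double, holding the other predictors constant. In contrast, the odds ratios for Gr.Liv.Area (1.0002) and Lot.Area (1.000037) are very close to 1. Note that these are per-square-foot odds ratios, so part of their closeness to 1 simply reflects the small unit of measurement; the conclusion that their effects are weak rests on their non-significant p-values rather than on the size of the odds ratios alone.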
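As a general rule, an odds ratio of 1.3 would mean that each one-unit increase in the predictor raises the odds by 30%. Here, the odds ratio of 2.037 for Overall.Qual means that each one-unit increase in overall quality raises the odds of having central air by about 104%.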
# Calculate a Wald confidence interval for the Gr.Liv.Area coefficient.
# The interval is on the log-odds scale (the scale of the model coefficients).
# Extract the coefficient and its standard error
coeff_se <- summary(logit_model)$coefficients["Gr.Liv.Area", "Std. Error"]
coeff <- coef(logit_model)["Gr.Liv.Area"]
# Derive the two-sided critical value from the confidence level instead of
# hard-coding the rounded 1.96, so the interval is exact and the level can be
# changed in one place.
conf_level <- 0.95
z_value <- qnorm(1 - (1 - conf_level) / 2)
# Interval bounds: estimate -/+ z * standard error
ci_lower <- coeff - z_value * coeff_se
ci_upper <- coeff + z_value * coeff_se
# First element is the lower bound, second the upper bound
ci <- c(ci_lower, ci_upper)
We extract the standard error (the "Std. Error" column of the coefficient table) and the estimated coefficient for Gr.Liv.Area.
Using these values, we calculate the 95% confidence interval for the coefficient. We do this by subtracting and adding z_value * coeff_se to the coefficient.
A 95% CI means we are 95% confident that the actual coefficient lies within this range.
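Note that this interval is on the log-odds scale, so it cannot be read directly as a percentage change in the odds; it must first be exponentiated. For example, if the exponentiated CI for Gr.Liv.Area were [1.05, 1.20], we would conclude that, with 95% confidence, each additional square foot in living area is associated with a 5% to 20% increase in the odds of having central air conditioning.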
# Convert to odds ratio for interpretability: exponentiating both bounds
# moves the interval from the log-odds scale to the odds-ratio scale
ci_odds <- exp(ci)
We exponentiate the confidence interval values to interpret them as odds ratios.
For example, if the odds ratio CI for Gr.Liv.Area ranges from 1.05 to 1.20, it means that each additional square foot in Gr.Liv.Area is associated with a 5% to 20% increase in the odds of having central air.
# Output results: first the odds ratios for every model coefficient, then the
# 95% confidence interval for Gr.Liv.Area on the odds-ratio scale.
# Lines starting with "##" are the recorded console output of each statement.
cat("Logistic Regression Model Coefficients (Odds Ratios):\n")
## Logistic Regression Model Coefficients (Odds Ratios):
print(exp(coef(logit_model)))
## (Intercept) Gr.Liv.Area Overall.Qual Year.Built Lot.Area
## 1.029973e-42 1.000225e+00 2.037369e+00 1.049906e+00 1.000037e+00
cat("\n95% Confidence Interval for Gr.Liv.Area (Odds Ratio):\n")
##
## 95% Confidence Interval for Gr.Liv.Area (Odds Ratio):
# Both elements carry the coefficient's name; the first value is the lower
# bound and the second the upper bound.
print(ci_odds)
## Gr.Liv.Area Gr.Liv.Area
## 0.9997674 1.0006820
The output presents the odds ratios for the logistic regression model, indicating that the likelihood of having central air significantly increases with overall quality (odds ratio of 2.037), while living area and lot size have negligible impacts (odds ratios close to 1). The 95% confidence interval for Gr.Liv.Area (0.9998 to 1.0007) suggests that the effect of living area on the odds of having central air is very weak, as it includes 1, indicating no statistically significant association.