Getting started: Setting up the environment

Data Exploration

# Display the first few rows of the dataset
head(train)
##   Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1  1         60       RL          65    8450   Pave  <NA>      Reg         Lvl
## 2  2         20       RL          80    9600   Pave  <NA>      Reg         Lvl
## 3  3         60       RL          68   11250   Pave  <NA>      IR1         Lvl
## 4  4         70       RL          60    9550   Pave  <NA>      IR1         Lvl
## 5  5         60       RL          84   14260   Pave  <NA>      IR1         Lvl
## 6  6         50       RL          85   14115   Pave  <NA>      IR1         Lvl
##   Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1    AllPub    Inside       Gtl      CollgCr       Norm       Norm     1Fam
## 2    AllPub       FR2       Gtl      Veenker      Feedr       Norm     1Fam
## 3    AllPub    Inside       Gtl      CollgCr       Norm       Norm     1Fam
## 4    AllPub    Corner       Gtl      Crawfor       Norm       Norm     1Fam
## 5    AllPub       FR2       Gtl      NoRidge       Norm       Norm     1Fam
## 6    AllPub    Inside       Gtl      Mitchel       Norm       Norm     1Fam
##   HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1     2Story           7           5      2003         2003     Gable  CompShg
## 2     1Story           6           8      1976         1976     Gable  CompShg
## 3     2Story           7           5      2001         2002     Gable  CompShg
## 4     2Story           7           5      1915         1970     Gable  CompShg
## 5     2Story           8           5      2000         2000     Gable  CompShg
## 6     1.5Fin           5           5      1993         1995     Gable  CompShg
##   Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1     VinylSd     VinylSd    BrkFace        196        Gd        TA      PConc
## 2     MetalSd     MetalSd       None          0        TA        TA     CBlock
## 3     VinylSd     VinylSd    BrkFace        162        Gd        TA      PConc
## 4     Wd Sdng     Wd Shng       None          0        TA        TA     BrkTil
## 5     VinylSd     VinylSd    BrkFace        350        Gd        TA      PConc
## 6     VinylSd     VinylSd       None          0        TA        TA       Wood
##   BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1       Gd       TA           No          GLQ        706          Unf
## 2       Gd       TA           Gd          ALQ        978          Unf
## 3       Gd       TA           Mn          GLQ        486          Unf
## 4       TA       Gd           No          ALQ        216          Unf
## 5       Gd       TA           Av          GLQ        655          Unf
## 6       Gd       TA           No          GLQ        732          Unf
##   BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1          0       150         856    GasA        Ex          Y      SBrkr
## 2          0       284        1262    GasA        Ex          Y      SBrkr
## 3          0       434         920    GasA        Ex          Y      SBrkr
## 4          0       540         756    GasA        Gd          Y      SBrkr
## 5          0       490        1145    GasA        Ex          Y      SBrkr
## 6          0        64         796    GasA        Ex          Y      SBrkr
##   X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1       856       854            0      1710            1            0        2
## 2      1262         0            0      1262            0            1        2
## 3       920       866            0      1786            1            0        2
## 4       961       756            0      1717            1            0        1
## 5      1145      1053            0      2198            1            0        2
## 6       796       566            0      1362            1            0        1
##   HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1        1            3            1          Gd            8        Typ
## 2        0            3            1          TA            6        Typ
## 3        1            3            1          Gd            6        Typ
## 4        0            3            1          Gd            7        Typ
## 5        1            4            1          Gd            9        Typ
## 6        1            1            1          TA            5        Typ
##   Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1          0        <NA>     Attchd        2003          RFn          2
## 2          1          TA     Attchd        1976          RFn          2
## 3          1          TA     Attchd        2001          RFn          2
## 4          1          Gd     Detchd        1998          Unf          3
## 5          1          TA     Attchd        2000          RFn          3
## 6          0        <NA>     Attchd        1993          Unf          2
##   GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1        548         TA         TA          Y          0          61
## 2        460         TA         TA          Y        298           0
## 3        608         TA         TA          Y          0          42
## 4        642         TA         TA          Y          0          35
## 5        836         TA         TA          Y        192          84
## 6        480         TA         TA          Y         40          30
##   EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1             0          0           0        0   <NA>  <NA>        <NA>
## 2             0          0           0        0   <NA>  <NA>        <NA>
## 3             0          0           0        0   <NA>  <NA>        <NA>
## 4           272          0           0        0   <NA>  <NA>        <NA>
## 5             0          0           0        0   <NA>  <NA>        <NA>
## 6             0        320           0        0   <NA> MnPrv        Shed
##   MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1       0      2   2008       WD        Normal    208500
## 2       0      5   2007       WD        Normal    181500
## 3       0      9   2008       WD        Normal    223500
## 4       0      2   2006       WD       Abnorml    140000
## 5       0     12   2008       WD        Normal    250000
## 6     700     10   2009       WD        Normal    143000

Explore the variables

# Select quantitative variables
quantitative_vars <- train %>%
  select_if(is.numeric)
head(quantitative_vars)
##   Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt
## 1  1         60          65    8450           7           5      2003
## 2  2         20          80    9600           6           8      1976
## 3  3         60          68   11250           7           5      2001
## 4  4         70          60    9550           7           5      1915
## 5  5         60          84   14260           8           5      2000
## 6  6         50          85   14115           5           5      1993
##   YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF X1stFlrSF
## 1         2003        196        706          0       150         856       856
## 2         1976          0        978          0       284        1262      1262
## 3         2002        162        486          0       434         920       920
## 4         1970          0        216          0       540         756       961
## 5         2000        350        655          0       490        1145      1145
## 6         1995          0        732          0        64         796       796
##   X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## 1       854            0      1710            1            0        2        1
## 2         0            0      1262            0            1        2        0
## 3       866            0      1786            1            0        2        1
## 4       756            0      1717            1            0        1        0
## 5      1053            0      2198            1            0        2        1
## 6       566            0      1362            1            0        1        1
##   BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces GarageYrBlt GarageCars
## 1            3            1            8          0        2003          2
## 2            3            1            6          1        1976          2
## 3            3            1            6          1        2001          2
## 4            3            1            7          1        1998          3
## 5            4            1            9          1        2000          3
## 6            1            1            5          0        1993          2
##   GarageArea WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch
## 1        548          0          61             0          0           0
## 2        460        298           0             0          0           0
## 3        608          0          42             0          0           0
## 4        642          0          35           272          0           0
## 5        836        192          84             0          0           0
## 6        480         40          30             0        320           0
##   PoolArea MiscVal MoSold YrSold SalePrice
## 1        0       0      2   2008    208500
## 2        0       0      5   2007    181500
## 3        0       0      9   2008    223500
## 4        0       0      2   2006    140000
## 5        0       0     12   2008    250000
## 6        0     700     10   2009    143000
# Check skewness of each quantitative variable
skewness <- quantitative_vars %>%
  summarise(across(everything(), ~ skewness(.x))) %>%
  gather(variable, skewness) %>%
  arrange(desc(skewness)) %>%
  head(5)
# Print skewness of quantitative variables
print("Skewness of quantitative variables:")
## [1] "Skewness of quantitative variables:"
print(skewness)
##       variable  skewness
## 1      MiscVal 24.426522
## 2     PoolArea 14.797918
## 3      LotArea 12.182615
## 4   X3SsnPorch 10.283178
## 5 LowQualFinSF  8.992833
# Plot the distribution of a highly skewed variable
skewed_variable <- skewness$variable[1]
ggplot(data, aes_string(x =  skewed_variable)) +
  geom_histogram(bins = 30, fill = "seagreen", alpha = 0.7) +
  ggtitle(paste("Distribution of", skewed_variable)) +
  theme_minimal()
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Select a right-skewed variable for X and define Y:

# Define X and Y
X <- train$LotArea
Y <- train$SalePrice

# Print first few values of X and Y
print("X (LotArea):")
## [1] "X (LotArea):"
head(X)
## [1]  8450  9600 11250  9550 14260 14115
print("Y (SalePrice):")
## [1] "Y (SalePrice):"
head(Y)
## [1] 208500 181500 223500 140000 250000 143000

Verify the skewness and plot the distribution of X to ensure it is right-skewed:

# Distribution of Lot Area, our selected X variable
skewed_variable <- "LotArea"

# Verify the skewness of LotArea
lotarea_skewness <- skewness(X, na.rm = TRUE)
print(paste("Skewness of LotArea:", lotarea_skewness))
## [1] "Skewness of LotArea: 12.182615015142"
# Plot the distribution of LotArea
ggplot(data, aes(x = LotArea)) +
  geom_histogram(bins = 30, fill = "pink", alpha = 0.7) +
  ggtitle("Distribution of LotArea") +
  theme_minimal()

Probability

Calculate the 3rd quartile of X and the 2nd quartile of Y:

# Calculate the 3rd quartile of X (LotArea)
x_3rd_quartile <- quantile(X, 0.75, na.rm = TRUE)

# Calculate the 2nd quartile (median) of Y (SalePrice)
y_2nd_quartile <- quantile(Y, 0.50, na.rm = TRUE)

# Print the quartiles
print(paste("3rd Quartile of X (LotArea):", x_3rd_quartile))
## [1] "3rd Quartile of X (LotArea): 11601.5"
print(paste("2nd Quartile of Y (SalePrice):", y_2nd_quartile))
## [1] "2nd Quartile of Y (SalePrice): 163000"

Calculate the probabilities:

  1. \(P(X>x∣Y>y)\)
  2. \(P(X>x,Y>y)\)
  3. \(P(X<x∣Y>y)\)
# Calculate P(X > x)
P_X_greater_x <- sum(X > x_3rd_quartile) / length(X)

# Calculate P(Y > y)
P_Y_greater_y <- sum(Y > y_2nd_quartile) / length(Y)

# Calculate P(X > x, Y > y)
P_X_greater_x_and_Y_greater_y <- sum(X > x_3rd_quartile & Y > y_2nd_quartile) / length(X)

# Calculate P(X > x | Y > y)
P_X_greater_x_given_Y_greater_y <- P_X_greater_x_and_Y_greater_y / P_Y_greater_y

# Calculate P(X < x | Y > y)
P_X_less_x_given_Y_greater_y <- 1 - P_X_greater_x_given_Y_greater_y

# Print the probabilities with 4 decimal digits
cat(sprintf("P(X > x | Y > y): %.4f\n", P_X_greater_x_given_Y_greater_y))
## P(X > x | Y > y): 0.3791
cat(sprintf("P(X > x, Y > y): %.4f\n", P_X_greater_x_and_Y_greater_y))
## P(X > x, Y > y): 0.1890
cat(sprintf("P(X < x | Y > y): %.4f\n", P_X_less_x_given_Y_greater_y))
## P(X < x | Y > y): 0.6209

Interpret the probabilities:

  1. $P(X>x∣Y>y): $ This is the conditional probability that Lot Area is greater than its 3rd quartile given that Sale Price is greater than its median. It tells us how likely a lot area is large if the house price is above the median. That probability is 0.3791. This means that given the Sale Price is above the median, there is a 37.91% chance that the Lo tArea is above the 3rd quartile.
  2. $P(X>x,Y>y): $ This is the joint probability that Lot Area is greater than its 3rd quartile and Sale Price is greater than its median. It gives the likelihood of both events happening simultaneously. That probability is estimated at 0.1890. This indicates that there is an 18.90% chance that both the Lot Area is above the 3rd quartile and the Sale Price is above the median simultaneously.
  3. \(P(X<x∣Y>y):\) This is the conditional probability that Lot Area is less than its 3rd quartile given that Sale Price is greater than its median. It tells us how likely a lot area is small if the house price is above the median. This probability is estimated at 0.6209. This means that given the Sale Price is above the median, there is a 62.09% chance that the Lot Area is below the 3rd quartile.
# Calculate the counts for the table
counts_table <- table(X <= quantile(X, 0.5), Y <= quantile(Y, 0.5))

# Add row and column names
rownames(counts_table) <- c("<= 3rd quartile", "> 3rd quartile")
colnames(counts_table) <- c("<= 2nd quartile", "> 2nd quartile")

# Add totals
counts_table <- addmargins(counts_table)

# Print the table
print("Counts Table:")
## [1] "Counts Table:"
print(counts_table)
##                  
##                   <= 2nd quartile > 2nd quartile  Sum
##   <= 3rd quartile             477            253  730
##   > 3rd quartile              251            479  730
##   Sum                         728            732 1460

Calculate the conditional probability of A given B

# Calculate the counts for A (observations above the 3rd quartile for X)
count_A <- sum(X > quantile(X, 0.75))

# Calculate the counts for B (observations above the 2nd quartile for Y)
count_B <- sum(Y > quantile(Y, 0.50))

# Calculate the counts for both A and B
count_A_and_B <- sum(X > quantile(X, 0.75) & Y > quantile(Y, 0.50))

# Calculate P(A|B)
P_A_given_B <- count_A_and_B / count_B

# Calculate P(A)
P_A <- count_A / length(X)

# Calculate P(B)
P_B <- count_B / length(Y)

print(P_A_given_B)
## [1] 0.3791209
print(P_A )
## [1] 0.25
print(P_B)
## [1] 0.4986301

Checking whether the conditional probability of A given B is equal to the probability of A times the probability of B

# Check if P(A|B) = P(A) * P(B)
if (round(P_A_given_B, 4) == round(P_A * P_B, 4)) {
  print("P(A|B) = P(A) * P(B)")
} else {
  print("P(A|B) != P(A) * P(B)")
}
## [1] "P(A|B) != P(A) * P(B)"

Chi-Square test for association

observed <- c(count_A_and_B, count_A - count_A_and_B, count_B - count_A_and_B, length(X) - (count_A + count_B - count_A_and_B))
expected <- c(P_A_given_B * count_B, (1 - P_A_given_B) * count_B, P_A_given_B * (length(X) - count_B), (1 - P_A_given_B) * (length(X) - count_B))
chisq <- sum((observed - expected)^2 / expected)

# Calculate the degrees of freedom
df <- 1

# Calculate the p-value
p_value <- pchisq(chisq, df, lower.tail = FALSE)

# Print the results
print(paste("Chi-Square statistic:", chisq))
## [1] "Chi-Square statistic: 479.422998091602"
print(paste("Degrees of freedom:", df))
## [1] "Degrees of freedom: 1"
print(paste("p-value:", p_value))
## [1] "p-value: 2.85297622258684e-106"

Descriptive and Inferential Statistics.

Univariate Descriptive Statistics and Plots:

# Univariate descriptive statistics
summary(train)
##        Id           MSSubClass      MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##                                                      NA's   :259     
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1460        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.099   Mean   :5.575   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1460        Length:1460        Length:1460       
##  1st Qu.:1967   Class :character   Class :character   Class :character  
##  Median :1994   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1985                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 103.7                     
##                                        3rd Qu.: 166.0                     
##                                        Max.   :1600.0                     
##                                        NA's   :8                          
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 383.5   Mode  :character  
##                                        Mean   : 443.6                     
##                                        3rd Qu.: 712.2                     
##                                        Max.   :5644.0                     
##                                                                           
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF       Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:1460       
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Class :character  
##  Median :   0.00   Median : 477.5   Median : 991.5   Mode  :character  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4                     
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2                     
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0                     
##                                                                        
##   HeatingQC          CentralAir         Electrical          X1stFlrSF   
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##                                                                         
##    X2ndFlrSF     LowQualFinSF       GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :  0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000  
##  Median :   0   Median :  0.000   Median :1464   Median :0.0000  
##  Mean   : 347   Mean   :  5.845   Mean   :1515   Mean   :0.4253  
##  3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000  
##  Max.   :2065   Max.   :572.000   Max.   :5642   Max.   :3.0000  
##                                                                  
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.05753   Mean   :1.565   Mean   :0.3829   Mean   :2.866  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##                                                                    
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1460        Min.   : 2.000   Length:1460       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.047                      Mean   : 6.518                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##                                                                        
##    Fireplaces    FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.000   Length:1460        Length:1460        Min.   :1900  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:1961  
##  Median :1.000   Mode  :character   Mode  :character   Median :1980  
##  Mean   :0.613                                         Mean   :1979  
##  3rd Qu.:1.000                                         3rd Qu.:2002  
##  Max.   :3.000                                         Max.   :2010  
##                                                        NA's   :81    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1460        Min.   :0.000   Min.   :   0.0   Length:1460       
##  Class :character   1st Qu.:1.000   1st Qu.: 334.5   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 473.0                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :4.000   Max.   :1418.0                     
##                                                                        
##   GarageCond         PavedDrive          WoodDeckSF      OpenPorchSF    
##  Length:1460        Length:1460        Min.   :  0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  0.00   Median : 25.00  
##                                        Mean   : 94.24   Mean   : 46.66  
##                                        3rd Qu.:168.00   3rd Qu.: 68.00  
##                                        Max.   :857.00   Max.   :547.00  
##                                                                         
##  EnclosedPorch      X3SsnPorch      ScreenPorch        PoolArea      
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
##  Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
##                                                                      
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1460        Length:1460        Length:1460        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   43.49  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :15500.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.322   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##                                                                       
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
## 
# Histogram of X (LotArea)
ggplot(train, aes(x = LotArea)) +
  geom_histogram(fill = "seagreen", alpha = 0.7) +
  ggtitle("Histogram of LotArea") +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of Y (SalePrice)
ggplot(train, aes(x = SalePrice)) +
  geom_histogram(bins = 39, fill = "seagreen", alpha = 0.7) +
  ggtitle("Histogram of SalePrice") +
   theme_minimal()

Scatterplot of X and Y:

# Scatterplot of X and Y with 
ggplot(data, aes(x = LotArea, y = SalePrice)) +
  geom_point(color = "seagreen", alpha = 0.3) +  # Set color to blue and alpha to 0.5
  ggtitle("Scatterplot of LotArea vs SalePrice") +
  theme_minimal()

95% Confidence Interval for the Difference in Means:

# Calculate the difference in means
mean_diff <- mean(train$LotArea) - mean(train$SalePrice)

# Calculate standard error
se_diff <- sqrt(var(train$LotArea) / length(train$LotArea) + var(train$SalePrice) / length(train$SalePrice))

# Calculate the 95% confidence interval
ci_lower <- mean_diff - 1.96 * se_diff
ci_upper <- mean_diff + 1.96 * se_diff

# Print the confidence interval
print(paste("95% CI for the difference in means:", ci_lower, "-", ci_upper))
## [1] "95% CI for the difference in means: -174511.45214287 - -166297.283473569"
# Test the hypothesis that the correlation is 0 and provide a 99% confidence interval

# Convert X and Y to numeric, replacing non-numeric values with NA
train <- train %>% 
  mutate(X = as.numeric(X),
         Y = as.numeric(Y))

# Test the hypothesis that the correlation is 0 and provide a 99% confidence interval
cor_test_99 <- cor.test(train$X, train$Y, conf.level = 0.99)

# Print the results of the hypothesis test and confidence interval
print("Hypothesis Test Results (99% confidence level):")
## [1] "Hypothesis Test Results (99% confidence level):"
print(cor_test_99)
## 
##  Pearson's product-moment correlation
## 
## data:  train$X and train$Y
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
##  0.2000196 0.3254375
## sample estimates:
##       cor 
## 0.2638434

Correlation Matrix and Hypothesis Testing:

# Correlation matrix for LotArea and SalePrice
cor_matrix <- cor(train[c("LotArea", "SalePrice")])

# Print the correlation matrix
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
print(cor_matrix)
##             LotArea SalePrice
## LotArea   1.0000000 0.2638434
## SalePrice 0.2638434 1.0000000
# Test the hypothesis that the correlation is 0
cor_test <- cor.test(train$LotArea, train$SalePrice)

# Print the results of the hypothesis test
print("Hypothesis Test Results:")
## [1] "Hypothesis Test Results:"
print(cor_test)
## 
##  Pearson's product-moment correlation
## 
## data:  train$LotArea and train$SalePrice
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2154574 0.3109369
## sample estimates:
##       cor 
## 0.2638434

Meaning of the results obtained during Descriptive and Inferential Statistics

The above descriptive and inferential statistics analysis provided insights into the descriptive characteristics of variables X (“LotArea” ) and Y (SalePrice”), allowing us to assess the strength and significance of their relationship.

95% Confidence Interval for the Difference in Means:

The correlation matrix quantifies the strength and direction of the linear relationship between “LotArea” and “SalePrice”. - A correlation coefficient close to +1 indicates a strong positive linear relationship, close to -1 indicates a strong negative linear relationship, and close to 0 indicates no linear relationship. - The hypothesis test determines whether the observed correlation coefficient is significantly different from zero. A low p-value (typically below the chosen significance level, e.g., 0.05) indicates that the correlation is statistically significant, suggesting a meaningful relationship between “LotArea” and “SalePrice”.

Linear Algebra and Correlation.

# Load necessary library
library(psych)

# Invert the correlation matrix to obtain the precision matrix
precision_matrix <- solve(cor_matrix)

# Multiply the correlation matrix by the precision matrix
cor_precision_multiply <- cor_matrix %*% precision_matrix

# Multiply the precision matrix by the correlation matrix
precision_cor_multiply <- precision_matrix %*% cor_matrix

# Perform principal component analysis (PCA)
pca_result <- principal(cor_matrix, nfactors = 2, rotate = "none")

# Print the PCA results
print("PCA Results:")
## [1] "PCA Results:"
print(pca_result)
## Principal Components Analysis
## Call: principal(r = cor_matrix, nfactors = 2, rotate = "none")
## Standardized loadings (pattern matrix) based upon correlation matrix
##            PC1   PC2 h2      u2 com
## LotArea   0.79 -0.61  1 1.1e-16 1.9
## SalePrice 0.79  0.61  1 1.1e-16 1.9
## 
##                        PC1  PC2
## SS loadings           1.26 0.74
## Proportion Var        0.63 0.37
## Cumulative Var        0.63 1.00
## Proportion Explained  0.63 0.37
## Cumulative Proportion 0.63 1.00
## 
## Mean item complexity =  1.9
## Test of the hypothesis that 2 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0 
## 
## Fit based upon off diagonal values = 1
# Interpretation
# The PCA results provide information about the underlying structure of the variables.
# The loadings indicate the correlation between the original variables and the principal components.
# The eigenvalues represent the variance explained by each principal component.
# The proportion of variance explained by each component can be used to assess the contribution of each component to the total variance.
# By examining the loadings and eigenvalues, we can identify which variables contribute most to each principal component and understand the relationships between variables in the dataset.

Discussion:

PCA helps in identifying patterns and relationships between variables in the dataset. It reduces the dimensionality of the data while retaining most of its variability, making it easier to interpret and visualize. By examining the loadings and eigenvalues, we gain insights into the underlying structure of the dataset and the relationships between variables. PCA can be used for data reduction, feature selection, and data visualization, making it a valuable tool in exploratory data analysis and dimensionality reduction techniques.

Calculus-Based Probability & Statistics.

Shift the Skewed Variable: Shift the “LotArea” variable so that the minimum value is above zero.

shifted_LotArea <- train$LotArea - min(train$LotArea) + 1  # Shifted variable with minimum value above zero
head(shifted_LotArea)
## [1]  7151  8301  9951  8251 12961 12816

Fit Exponential Distribution:

library(MASS)
fit_exp <- fitdistr(shifted_LotArea, "exponential")  # Fit exponential distribution
lambda <- fit_exp$estimate  # Optimal lambda value
head(lambda)
##         rate 
## 0.0001084854

Generate Samples from Exponential Distribution

samples_exp <- rexp(1000, lambda)  # Generate 1000 samples from exponential distribution
head(samples_exp)
## [1] 16764.94038  7031.50121    28.57506  8234.16895   799.68282  4037.66132

Plot Histograms

hist(shifted_LotArea, main = "Histogram of Shifted LotArea", xlab = "Shifted LotArea", col = "seagreen", breaks = 30)

hist(samples_exp, main = "Histogram of Samples from Exponential Distribution", xlab = "Samples", col = "pink", breaks = 30)

Calculate Percentiles

percentile_5 <- qexp(0.05, rate = lambda)  # 5th percentile using exponential distribution
percentile_95 <- qexp(0.95, rate = lambda)  # 95th percentile using exponential distribution
percentile_5
## [1] 472.8128
percentile_95
## [1] 27614.15

Calculate Confidence Interval & Calculate Empirical Percentiles

# Calculate Empirical Percentiles
mean_LotArea <- mean(shifted_LotArea)
sd_LotArea <- sd(shifted_LotArea)
conf_interval <- c(mean_LotArea - 1.96 * sd_LotArea / sqrt(length(shifted_LotArea)),
                   mean_LotArea + 1.96 * sd_LotArea / sqrt(length(shifted_LotArea)))

# Calculate Empirical Percentiles
empirical_percentile_5 <- quantile(shifted_LotArea, 0.05)
empirical_percentile_95 <- quantile(shifted_LotArea, 0.95)

Display results:

# Print Results
print("Optimal Lambda for Exponential Distribution:")
## [1] "Optimal Lambda for Exponential Distribution:"
print(lambda)
##         rate 
## 0.0001084854
print("5th and 95th Percentiles using Exponential Distribution:")
## [1] "5th and 95th Percentiles using Exponential Distribution:"
print(c(percentile_5, percentile_95))
## [1]   472.8128 27614.1451
print("95% Confidence Interval from Empirical Data:")
## [1] "95% Confidence Interval from Empirical Data:"
print(conf_interval)
## [1] 8705.834 9729.823
print("Empirical 5th and 95th Percentiles of Shifted LotArea:")
## [1] "Empirical 5th and 95th Percentiles of Shifted LotArea:"

Discussion:

The results from this section help us understand the distributional properties of the “LotArea” variable and assess the suitability of the exponential distribution as a model. They provide valuable information for making inferences and decisions based on the data. More specifically: - By fitting an exponential probability density function to the shifted “LotArea” variable, we attempted to model its distribution with a known parametric form.The optimal λ value obtained from the fit represents the rate parameter of the exponential distribution, indicating the average rate of occurrence for “LotArea” values. - The comparison between the histograms of the original data and the fitted exponential distribution helps us assess the goodness-of-fit. Deviations between the two histograms may suggest limitations in the assumed distribution.In other words, this visual comparison of the histograms is an easily way to assess how well the exponential distribution fits the empirical data. If the histograms match closely, it suggests a good fit.

  • Calculation of percentiles provides information about the spread and central tendency of the distribution.
  • The 95% confidence interval we generated estimates the range within which the population mean of the shifted “LotArea” variable is likely to lie, assuming normality.
  • We also calculated the empirical 5th and 95th percentiles directly from the shifted “LotArea” data.These percentiles provide a measure of the distribution’s spread based on observed data.

Modeling

Simple linear regression

data <- read.csv("https://raw.githubusercontent.com/Heleinef/Data-Science-Master_Heleine/main/train_csv.csv")
train = data

# Fit linear regression model
lm_model <- lm(SalePrice ~ LotArea, data = train)

# Get model summary
summary(lm_model)
## 
## Call:
## lm(formula = SalePrice ~ LotArea, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -275668  -48169  -17725   31248  553356 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.588e+05  2.915e+03   54.49   <2e-16 ***
## LotArea     2.100e+00  2.011e-01   10.45   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 76650 on 1458 degrees of freedom
## Multiple R-squared:  0.06961,    Adjusted R-squared:  0.06898 
## F-statistic: 109.1 on 1 and 1458 DF,  p-value: < 2.2e-16
par(mfrow=c(1,1))
plot(lm_model)

hist(lm_model$residuals)

Let’s apply a Box-Cox Transformation to our model, in order to stabilize the variance and make the data more normally distributed.

A Box-Cox transformation is “an optimization algorithm designed to find the best powers to raise each variable so that the resulting distribution cannot be rejected.”(Prof.Larry Fulton)

# Load necessary libraries
library(MASS)

# Fit initial linear regression model
lm_model <- lm(SalePrice ~ LotArea, data = train)

# Apply Box-Cox transformation
boxcox_result <- boxcox(lm_model, lambda = seq(-2, 2, by = 0.1))

# Find the lambda value that maximizes the log-likelihood
lambda_optimal <- boxcox_result$x[which.max(boxcox_result$y)]

# Display the optimal lambda value
cat("Optimal lambda for Box-Cox transformation:", lambda_optimal, "\n")
## Optimal lambda for Box-Cox transformation: -0.06060606
# Transform the response variable using the optimal lambda
if(lambda_optimal == 0) {
  train$SalePrice_transformed <- log(train$SalePrice)
} else {
  train$SalePrice_transformed <- (train$SalePrice^lambda_optimal - 1) / lambda_optimal
}

# Fit the linear regression model again using the transformed response variable
lm_model_transformed <- lm(SalePrice_transformed ~ LotArea, data = train)

# Get model summary
summary(lm_model_transformed)
## 
## Call:
## lm(formula = SalePrice_transformed ~ LotArea, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.77614 -0.11315 -0.00998  0.11754  0.66770 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 8.484e+00  7.081e-03  1198.1   <2e-16 ***
## LotArea     4.936e-06  4.885e-07    10.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1862 on 1458 degrees of freedom
## Multiple R-squared:  0.06544,    Adjusted R-squared:  0.0648 
## F-statistic: 102.1 on 1 and 1458 DF,  p-value: < 2.2e-16
par(mfrow=c(1,1))
plot(lm_model_transformed )

hist(lm_model_transformed$residuals)

Multiple Regression

# Fit A multiple regression model
# Load necessary libraries
library(dplyr)

# Load the dataset
data <- read.csv("https://raw.githubusercontent.com/Heleinef/Data-Science-Master_Heleine/main/train_csv.csv")
train = data

# Convert categorical variables to factors
train$KitchenQual <- as.factor(train$KitchenQual)

# Fit the multiple regression model with the specified variables, excluding Neighborhood
lmModel <- lm(SalePrice ~ LotArea + GrLivArea + PoolArea + GarageCars + KitchenQual+OverallCond+ Neighborhood,data = train)

# Get model summary
summary(lmModel)
## 
## Call:
## lm(formula = SalePrice ~ LotArea + GrLivArea + PoolArea + GarageCars + 
##     KitchenQual + OverallCond + Neighborhood, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -333611  -16485    -432   15491  245336 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          9.263e+04  1.203e+04   7.700 2.52e-14 ***
## LotArea              5.993e-01  1.054e-01   5.688 1.56e-08 ***
## GrLivArea            5.782e+01  2.361e+00  24.485  < 2e-16 ***
## PoolArea             1.202e+01  2.377e+01   0.506 0.613094    
## GarageCars           1.741e+04  1.721e+03  10.118  < 2e-16 ***
## KitchenQualFa       -7.705e+04  7.742e+03  -9.952  < 2e-16 ***
## KitchenQualGd       -5.935e+04  4.307e+03 -13.780  < 2e-16 ***
## KitchenQualTA       -7.283e+04  4.797e+03 -15.183  < 2e-16 ***
## OverallCond          6.507e+03  9.434e+02   6.898 7.92e-12 ***
## NeighborhoodBlueste -3.764e+04  2.663e+04  -1.414 0.157708    
## NeighborhoodBrDale  -3.829e+04  1.267e+04  -3.023 0.002550 ** 
## NeighborhoodBrkSide -3.144e+04  1.025e+04  -3.068 0.002195 ** 
## NeighborhoodClearCr -7.519e+03  1.141e+04  -0.659 0.510160    
## NeighborhoodCollgCr  4.034e+03  9.120e+03   0.442 0.658330    
## NeighborhoodCrawfor  2.698e+02  1.031e+04   0.026 0.979126    
## NeighborhoodEdwards -3.664e+04  9.698e+03  -3.778 0.000165 ***
## NeighborhoodGilbert -6.373e+03  9.596e+03  -0.664 0.506727    
## NeighborhoodIDOTRR  -4.518e+04  1.081e+04  -4.181 3.08e-05 ***
## NeighborhoodMeadowV -4.026e+04  1.253e+04  -3.214 0.001338 ** 
## NeighborhoodMitchel -1.441e+04  1.023e+04  -1.408 0.159460    
## NeighborhoodNAmes   -2.358e+04  9.268e+03  -2.544 0.011062 *  
## NeighborhoodNoRidge  6.332e+04  1.051e+04   6.024 2.17e-09 ***
## NeighborhoodNPkVill -2.397e+04  1.479e+04  -1.621 0.105217    
## NeighborhoodNridgHt  5.674e+04  9.713e+03   5.842 6.38e-09 ***
## NeighborhoodNWAmes  -1.810e+04  9.835e+03  -1.840 0.065916 .  
## NeighborhoodOldTown -5.417e+04  9.641e+03  -5.619 2.31e-08 ***
## NeighborhoodSawyer  -2.470e+04  9.895e+03  -2.496 0.012677 *  
## NeighborhoodSawyerW -7.948e+03  9.842e+03  -0.808 0.419497    
## NeighborhoodSomerst  1.657e+04  9.422e+03   1.759 0.078827 .  
## NeighborhoodStoneBr  6.649e+04  1.124e+04   5.913 4.19e-09 ***
## NeighborhoodSWISU   -4.508e+04  1.167e+04  -3.864 0.000117 ***
## NeighborhoodTimber   1.475e+04  1.057e+04   1.395 0.163104    
## NeighborhoodVeenker  2.401e+04  1.387e+04   1.731 0.083676 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35410 on 1427 degrees of freedom
## Multiple R-squared:  0.8056, Adjusted R-squared:  0.8013 
## F-statistic: 184.8 on 32 and 1427 DF,  p-value: < 2.2e-16

Key Findings and main takeaways from the model summary and results:

Residuals:

Min 1Q Median 3Q Max -333611 -16485 -432 15491 245336

The residuals represent the difference between observed and predicted values. The median residual being close to zero suggests that the model is unbiased, though the wide range indicates there are some large discrepancies between predicted and actual sale prices.

Significant Predictors:

  • LotArea and GrLivArea have a positive effect on SalePrice.

  • GarageCars significantly increases SalePrice.

  • Kitchen Quality has a strong impact, with lower qualities (Fair, Good, Typical/Average) significantly reducing the sale price compared to Excellent quality.

  • Overall Condition positively affects the sale price. Several Neighborhoods significantly affect sale prices, either positively or negatively.

    Non-significant Predictors:

  • PoolArea does not significantly affect the sale price and Many neighborhoods do not have a significant impact compared to the baseline. Model Fit

  • Our model explains a substantial portion of the variance (80.56%), indicating a good fit. The residual standard error suggests some variability in sale prices is not explained by the model, indicating room for improvement.

Overall, our regression model provides a comprehensive understanding of the factors influencing house prices. Significant predictors like lot area, living area, garage capacity, kitchen quality, and overall condition are crucial determinants of sale prices. Additionally, neighborhood effects highlight the importance of location. The model fits the data well, explaining over 80% of the variance, but there is potential for further refinement and improvement.

Model Diagnostics: Checking the assumptions of linearity for the lmModel we have created:

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
# checking for multicollinearity

# Fit the multiple regression model again 
lmModel <- lm(SalePrice ~ LotArea + GrLivArea + PoolArea + GarageCars + KitchenQual + OverallCond + Neighborhood, data = train)

# Check for multicollinearity using Variance Inflation Factor (VIF)
vif_values <- vif(lmModel)
print(vif_values)
##                  GVIF Df GVIF^(1/(2*Df))
## LotArea      1.286673  1        1.134316
## GrLivArea    1.791208  1        1.338360
## PoolArea     1.061342  1        1.030215
## GarageCars   1.923633  1        1.386951
## KitchenQual  2.684020  3        1.178866
## OverallCond  1.282228  1        1.132355
## Neighborhood 5.006971 24        1.034128

Interpreting the multicollinearity check of our model

The multicollinearity check of our model reveals that: - LotArea, GrLivArea, PoolArea, GarageCars, and OverallCond have VIF values close to 1, indicating low multicollinearity. - KitchenQual has a VIF of 2.68, which is slightly above 1, but still within an acceptable range, suggesting moderate multicollinearity. - Neighborhood has a VIF of 5.01, which is higher than the usual threshold of 5, indicating potential multicollinearity issues. This suggests that the Neighborhood variable may have strong linear relationships with other predictor variables in the model. Action: We will apply a regularization technique called box transformation

# Check for heteroscedasticity using the score test
score_test <- ncvTest(lmModel)
print(score_test)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 1669.506, Df = 1, p = < 2.22e-16

In this case, the p-value is extremely small (p < 2.22e-16), indicating strong evidence against the null hypothesis of homoscedasticity. Therefore, we reject the null hypothesis and conclude that there is heteroscedasticity present in the model. Because the presence of heteroscedasticity can lead to inefficient and biased estimates of the coefficients, we will apply a box- transformation algorithm to our model at to mitigate heteroscedasticity issue.

# Plot residuals vs fitted values to visually inspect heteroscedasticity
plot(lmModel, which = 1)

# Check for normality of residuals using the Shapiro-Wilk test
shapiro_test_result <- shapiro.test(lmModel$residuals)
print(shapiro_test_result)
## 
##  Shapiro-Wilk normality test
## 
## data:  lmModel$residuals
## W = 0.86935, p-value < 2.2e-16
# Plot Q-Q plot of residuals to visually inspect normality
qqnorm(lmModel$residuals)
qqline(lmModel$residuals)

# Plot histogram of residuals
hist(lmModel$residuals, breaks = 30, main = "Histogram of Residuals", xlab = "Residuals")

# Additional diagnostic plots
par(mfrow = c(1, 1))
plot(lmModel)

par(mfrow = c(1, 1))
# Load necessary library
library(car)

# Check if lmModel contains predictor variables
if (length(lmModel$coefficients) > 1) {
  # Calculate VIF values
  vif_values <- vif(lmModel)
  
  # Print VIF values
  print(vif_values)
} else {
  print("No predictor variables found in the linear regression model.")
}
##                  GVIF Df GVIF^(1/(2*Df))
## LotArea      1.286673  1        1.134316
## GrLivArea    1.791208  1        1.338360
## PoolArea     1.061342  1        1.030215
## GarageCars   1.923633  1        1.386951
## KitchenQual  2.684020  3        1.178866
## OverallCond  1.282228  1        1.132355
## Neighborhood 5.006971 24        1.034128
# Visual check for heteroscedasticity
ggplot(train, aes(fitted(lmModel), residuals(lmModel))) + 
  geom_point() + 
  geom_smooth(method = "lm") + 
  labs(title = "Residuals vs Fitted", x = "Fitted values", y = "Residuals")
## `geom_smooth()` using formula = 'y ~ x'

# Visual check for normality
qqplot <- ggplot(data.frame(residuals = lmModel$residuals), aes(sample = residuals)) +
  stat_qq() + 
  stat_qq_line() + 
  labs(title = "Q-Q Plot of Residuals", x = "Theoretical Quantiles", y = "Sample Quantiles")
# Histogram of residuals
histogram <- ggplot(data.frame(residuals = lmModel$residuals), aes(x = residuals)) + 
  geom_histogram(aes(y = ..density..), bins = 30) + 
  geom_density(col = "seagreen") + 
  labs(title = "Histogram of Residuals", x = "Residuals", y = "Density")
# Combine the plots
grid.arrange(qqplot, histogram, ncol = 2)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Box-Transformation of our lmModel

# Fit the linear regression model
lmModel <- lm(SalePrice ~ LotArea + GrLivArea + PoolArea + GarageCars + KitchenQual + OverallCond + Neighborhood, data = train)

# Determine optimal lambda value for Box-Cox transformation
boxcox_lambda <- boxcox(lmModel)

# Apply Box-Cox transformation using the optimal lambda
transformed_lmModel <- lm(SalePrice ~ LotArea + GrLivArea + PoolArea + GarageCars + KitchenQual + OverallCond + Neighborhood, 
                          data = train, 
                          lambda = boxcox_lambda$x[which.max(boxcox_lambda$y)])
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'lambda' will be disregarded
hist(transformed_lmModel$residuals)