# Display the first few rows of the dataset
head(train)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1 1 60 RL 65 8450 Pave <NA> Reg Lvl
## 2 2 20 RL 80 9600 Pave <NA> Reg Lvl
## 3 3 60 RL 68 11250 Pave <NA> IR1 Lvl
## 4 4 70 RL 60 9550 Pave <NA> IR1 Lvl
## 5 5 60 RL 84 14260 Pave <NA> IR1 Lvl
## 6 6 50 RL 85 14115 Pave <NA> IR1 Lvl
## Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1 AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 2 AllPub FR2 Gtl Veenker Feedr Norm 1Fam
## 3 AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 4 AllPub Corner Gtl Crawfor Norm Norm 1Fam
## 5 AllPub FR2 Gtl NoRidge Norm Norm 1Fam
## 6 AllPub Inside Gtl Mitchel Norm Norm 1Fam
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1 2Story 7 5 2003 2003 Gable CompShg
## 2 1Story 6 8 1976 1976 Gable CompShg
## 3 2Story 7 5 2001 2002 Gable CompShg
## 4 2Story 7 5 1915 1970 Gable CompShg
## 5 2Story 8 5 2000 2000 Gable CompShg
## 6 1.5Fin 5 5 1993 1995 Gable CompShg
## Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1 VinylSd VinylSd BrkFace 196 Gd TA PConc
## 2 MetalSd MetalSd None 0 TA TA CBlock
## 3 VinylSd VinylSd BrkFace 162 Gd TA PConc
## 4 Wd Sdng Wd Shng None 0 TA TA BrkTil
## 5 VinylSd VinylSd BrkFace 350 Gd TA PConc
## 6 VinylSd VinylSd None 0 TA TA Wood
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1 Gd TA No GLQ 706 Unf
## 2 Gd TA Gd ALQ 978 Unf
## 3 Gd TA Mn GLQ 486 Unf
## 4 TA Gd No ALQ 216 Unf
## 5 Gd TA Av GLQ 655 Unf
## 6 Gd TA No GLQ 732 Unf
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1 0 150 856 GasA Ex Y SBrkr
## 2 0 284 1262 GasA Ex Y SBrkr
## 3 0 434 920 GasA Ex Y SBrkr
## 4 0 540 756 GasA Gd Y SBrkr
## 5 0 490 1145 GasA Ex Y SBrkr
## 6 0 64 796 GasA Ex Y SBrkr
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1 856 854 0 1710 1 0 2
## 2 1262 0 0 1262 0 1 2
## 3 920 866 0 1786 1 0 2
## 4 961 756 0 1717 1 0 1
## 5 1145 1053 0 2198 1 0 2
## 6 796 566 0 1362 1 0 1
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1 1 3 1 Gd 8 Typ
## 2 0 3 1 TA 6 Typ
## 3 1 3 1 Gd 6 Typ
## 4 0 3 1 Gd 7 Typ
## 5 1 4 1 Gd 9 Typ
## 6 1 1 1 TA 5 Typ
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1 0 <NA> Attchd 2003 RFn 2
## 2 1 TA Attchd 1976 RFn 2
## 3 1 TA Attchd 2001 RFn 2
## 4 1 Gd Detchd 1998 Unf 3
## 5 1 TA Attchd 2000 RFn 3
## 6 0 <NA> Attchd 1993 Unf 2
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1 548 TA TA Y 0 61
## 2 460 TA TA Y 298 0
## 3 608 TA TA Y 0 42
## 4 642 TA TA Y 0 35
## 5 836 TA TA Y 192 84
## 6 480 TA TA Y 40 30
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1 0 0 0 0 <NA> <NA> <NA>
## 2 0 0 0 0 <NA> <NA> <NA>
## 3 0 0 0 0 <NA> <NA> <NA>
## 4 272 0 0 0 <NA> <NA> <NA>
## 5 0 0 0 0 <NA> <NA> <NA>
## 6 0 320 0 0 <NA> MnPrv Shed
## MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1 0 2 2008 WD Normal 208500
## 2 0 5 2007 WD Normal 181500
## 3 0 9 2008 WD Normal 223500
## 4 0 2 2006 WD Abnorml 140000
## 5 0 12 2008 WD Normal 250000
## 6 700 10 2009 WD Normal 143000
# Select quantitative variables
quantitative_vars <- train %>%
select_if(is.numeric)
head(quantitative_vars)
## Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt
## 1 1 60 65 8450 7 5 2003
## 2 2 20 80 9600 6 8 1976
## 3 3 60 68 11250 7 5 2001
## 4 4 70 60 9550 7 5 1915
## 5 5 60 84 14260 8 5 2000
## 6 6 50 85 14115 5 5 1993
## YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF X1stFlrSF
## 1 2003 196 706 0 150 856 856
## 2 1976 0 978 0 284 1262 1262
## 3 2002 162 486 0 434 920 920
## 4 1970 0 216 0 540 756 961
## 5 2000 350 655 0 490 1145 1145
## 6 1995 0 732 0 64 796 796
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## 1 854 0 1710 1 0 2 1
## 2 0 0 1262 0 1 2 0
## 3 866 0 1786 1 0 2 1
## 4 756 0 1717 1 0 1 0
## 5 1053 0 2198 1 0 2 1
## 6 566 0 1362 1 0 1 1
## BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces GarageYrBlt GarageCars
## 1 3 1 8 0 2003 2
## 2 3 1 6 1 1976 2
## 3 3 1 6 1 2001 2
## 4 3 1 7 1 1998 3
## 5 4 1 9 1 2000 3
## 6 1 1 5 0 1993 2
## GarageArea WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch
## 1 548 0 61 0 0 0
## 2 460 298 0 0 0 0
## 3 608 0 42 0 0 0
## 4 642 0 35 272 0 0
## 5 836 192 84 0 0 0
## 6 480 40 30 0 320 0
## PoolArea MiscVal MoSold YrSold SalePrice
## 1 0 0 2 2008 208500
## 2 0 0 5 2007 181500
## 3 0 0 9 2008 223500
## 4 0 0 2 2006 140000
## 5 0 0 12 2008 250000
## 6 0 700 10 2009 143000
# Check skewness of each quantitative variable
skewness <- quantitative_vars %>%
summarise(across(everything(), ~ skewness(.x))) %>%
gather(variable, skewness) %>%
arrange(desc(skewness)) %>%
head(5)
# Print skewness of quantitative variables
print("Skewness of quantitative variables:")
## [1] "Skewness of quantitative variables:"
print(skewness)
## variable skewness
## 1 MiscVal 24.426522
## 2 PoolArea 14.797918
## 3 LotArea 12.182615
## 4 X3SsnPorch 10.283178
## 5 LowQualFinSF 8.992833
# Plot the distribution of a highly skewed variable
skewed_variable <- skewness$variable[1]
ggplot(data, aes_string(x = skewed_variable)) +
geom_histogram(bins = 30, fill = "seagreen", alpha = 0.7) +
ggtitle(paste("Distribution of", skewed_variable)) +
theme_minimal()
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Define X and Y
X <- train$LotArea
Y <- train$SalePrice
# Print first few values of X and Y
print("X (LotArea):")
## [1] "X (LotArea):"
head(X)
## [1] 8450 9600 11250 9550 14260 14115
print("Y (SalePrice):")
## [1] "Y (SalePrice):"
head(Y)
## [1] 208500 181500 223500 140000 250000 143000
# Distribution of Lot Area, our selected X variable
skewed_variable <- "LotArea"
# Verify the skewness of LotArea
lotarea_skewness <- skewness(X, na.rm = TRUE)
print(paste("Skewness of LotArea:", lotarea_skewness))
## [1] "Skewness of LotArea: 12.182615015142"
# Plot the distribution of LotArea
ggplot(data, aes(x = LotArea)) +
geom_histogram(bins = 30, fill = "pink", alpha = 0.7) +
ggtitle("Distribution of LotArea") +
theme_minimal()
# Calculate the 3rd quartile of X (LotArea)
x_3rd_quartile <- quantile(X, 0.75, na.rm = TRUE)
# Calculate the 2nd quartile (median) of Y (SalePrice)
y_2nd_quartile <- quantile(Y, 0.50, na.rm = TRUE)
# Print the quartiles
print(paste("3rd Quartile of X (LotArea):", x_3rd_quartile))
## [1] "3rd Quartile of X (LotArea): 11601.5"
print(paste("2nd Quartile of Y (SalePrice):", y_2nd_quartile))
## [1] "2nd Quartile of Y (SalePrice): 163000"
# Calculate P(X > x)
P_X_greater_x <- sum(X > x_3rd_quartile) / length(X)
# Calculate P(Y > y)
P_Y_greater_y <- sum(Y > y_2nd_quartile) / length(Y)
# Calculate P(X > x, Y > y)
P_X_greater_x_and_Y_greater_y <- sum(X > x_3rd_quartile & Y > y_2nd_quartile) / length(X)
# Calculate P(X > x | Y > y)
P_X_greater_x_given_Y_greater_y <- P_X_greater_x_and_Y_greater_y / P_Y_greater_y
# Calculate P(X < x | Y > y)
P_X_less_x_given_Y_greater_y <- 1 - P_X_greater_x_given_Y_greater_y
# Print the probabilities with 4 decimal digits
cat(sprintf("P(X > x | Y > y): %.4f\n", P_X_greater_x_given_Y_greater_y))
## P(X > x | Y > y): 0.3791
cat(sprintf("P(X > x, Y > y): %.4f\n", P_X_greater_x_and_Y_greater_y))
## P(X > x, Y > y): 0.1890
cat(sprintf("P(X < x | Y > y): %.4f\n", P_X_less_x_given_Y_greater_y))
## P(X < x | Y > y): 0.6209
# Calculate the counts for the table
counts_table <- table(X <= quantile(X, 0.5), Y <= quantile(Y, 0.5))
# Add row and column names
rownames(counts_table) <- c("<= 3rd quartile", "> 3rd quartile")
colnames(counts_table) <- c("<= 2nd quartile", "> 2nd quartile")
# Add totals
counts_table <- addmargins(counts_table)
# Print the table
print("Counts Table:")
## [1] "Counts Table:"
print(counts_table)
##
## <= 2nd quartile > 2nd quartile Sum
## <= 3rd quartile 477 253 730
## > 3rd quartile 251 479 730
## Sum 728 732 1460
# Calculate the counts for A (observations above the 3rd quartile for X)
count_A <- sum(X > quantile(X, 0.75))
# Calculate the counts for B (observations above the 2nd quartile for Y)
count_B <- sum(Y > quantile(Y, 0.50))
# Calculate the counts for both A and B
count_A_and_B <- sum(X > quantile(X, 0.75) & Y > quantile(Y, 0.50))
# Calculate P(A|B)
P_A_given_B <- count_A_and_B / count_B
# Calculate P(A)
P_A <- count_A / length(X)
# Calculate P(B)
P_B <- count_B / length(Y)
print(P_A_given_B)
## [1] 0.3791209
print(P_A )
## [1] 0.25
print(P_B)
## [1] 0.4986301
# Check if P(A|B) = P(A) * P(B)
if (round(P_A_given_B, 4) == round(P_A * P_B, 4)) {
print("P(A|B) = P(A) * P(B)")
} else {
print("P(A|B) != P(A) * P(B)")
}
## [1] "P(A|B) != P(A) * P(B)"
observed <- c(count_A_and_B, count_A - count_A_and_B, count_B - count_A_and_B, length(X) - (count_A + count_B - count_A_and_B))
expected <- c(P_A_given_B * count_B, (1 - P_A_given_B) * count_B, P_A_given_B * (length(X) - count_B), (1 - P_A_given_B) * (length(X) - count_B))
chisq <- sum((observed - expected)^2 / expected)
# Calculate the degrees of freedom
df <- 1
# Calculate the p-value
p_value <- pchisq(chisq, df, lower.tail = FALSE)
# Print the results
print(paste("Chi-Square statistic:", chisq))
## [1] "Chi-Square statistic: 479.422998091602"
print(paste("Degrees of freedom:", df))
## [1] "Degrees of freedom: 1"
print(paste("p-value:", p_value))
## [1] "p-value: 2.85297622258684e-106"
# Univariate descriptive statistics
summary(train)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical X1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
# Histogram of X (LotArea)
ggplot(train, aes(x = LotArea)) +
geom_histogram(fill = "seagreen", alpha = 0.7) +
ggtitle("Histogram of LotArea") +
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of Y (SalePrice)
ggplot(train, aes(x = SalePrice)) +
geom_histogram(bins = 39, fill = "seagreen", alpha = 0.7) +
ggtitle("Histogram of SalePrice") +
theme_minimal()
# Scatterplot of X and Y with
ggplot(data, aes(x = LotArea, y = SalePrice)) +
geom_point(color = "seagreen", alpha = 0.3) + # Set color to blue and alpha to 0.5
ggtitle("Scatterplot of LotArea vs SalePrice") +
theme_minimal()
# Calculate the difference in means
mean_diff <- mean(train$LotArea) - mean(train$SalePrice)
# Calculate standard error
se_diff <- sqrt(var(train$LotArea) / length(train$LotArea) + var(train$SalePrice) / length(train$SalePrice))
# Calculate the 95% confidence interval
ci_lower <- mean_diff - 1.96 * se_diff
ci_upper <- mean_diff + 1.96 * se_diff
# Print the confidence interval
print(paste("95% CI for the difference in means:", ci_lower, "-", ci_upper))
## [1] "95% CI for the difference in means: -174511.45214287 - -166297.283473569"
# Test the hypothesis that the correlation is 0 and provide a 99% confidence interval
# Convert X and Y to numeric, replacing non-numeric values with NA
train <- train %>%
mutate(X = as.numeric(X),
Y = as.numeric(Y))
# Test the hypothesis that the correlation is 0 and provide a 99% confidence interval
cor_test_99 <- cor.test(train$X, train$Y, conf.level = 0.99)
# Print the results of the hypothesis test and confidence interval
print("Hypothesis Test Results (99% confidence level):")
## [1] "Hypothesis Test Results (99% confidence level):"
print(cor_test_99)
##
## Pearson's product-moment correlation
##
## data: train$X and train$Y
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## 0.2000196 0.3254375
## sample estimates:
## cor
## 0.2638434
# Correlation matrix for LotArea and SalePrice
cor_matrix <- cor(train[c("LotArea", "SalePrice")])
# Print the correlation matrix
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
print(cor_matrix)
## LotArea SalePrice
## LotArea 1.0000000 0.2638434
## SalePrice 0.2638434 1.0000000
# Test the hypothesis that the correlation is 0
cor_test <- cor.test(train$LotArea, train$SalePrice)
# Print the results of the hypothesis test
print("Hypothesis Test Results:")
## [1] "Hypothesis Test Results:"
print(cor_test)
##
## Pearson's product-moment correlation
##
## data: train$LotArea and train$SalePrice
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2154574 0.3109369
## sample estimates:
## cor
## 0.2638434
The above descriptive and inferential statistics analysis provided insights into the descriptive characteristics of variables X (“LotArea” ) and Y (SalePrice”), allowing us to assess the strength and significance of their relationship.
Univariate Descriptive Statistics and Plots provided summaries of the central tendency, dispersion, and shape of the distributions of “LotArea” and “SalePrice”;
Histograms visualized the distributions of “LotArea” and “SalePrice”, allowing us to understand their spread and identify any potential outliers;
Scatterplot of LotArea and SalePrice visually displays the relationship between “LotArea” and “SalePrice”. Each point representing an observation in the dataset, with “LotArea” on the x-axis and “SalePrice” on the y-axis;
The scatterplot also helps us assess if there is any discernible pattern or trend between “LotArea” and “SalePrice”.
95% Confidence Interval for the Difference in Means:
The confidence interval provides a range of values within which we are 95% confident that the true difference in means between “LotArea” and “SalePrice” lies. If the confidence interval includes zero, it suggests that there is no statistically significant difference between the means of “LotArea” and “SalePrice”.
Correlation Matrix and Hypothesis Testing:
The correlation matrix quantifies the strength and direction of the linear relationship between “LotArea” and “SalePrice”. - A correlation coefficient close to +1 indicates a strong positive linear relationship, close to -1 indicates a strong negative linear relationship, and close to 0 indicates no linear relationship. - The hypothesis test determines whether the observed correlation coefficient is significantly different from zero. A low p-value (typically below the chosen significance level, e.g., 0.05) indicates that the correlation is statistically significant, suggesting a meaningful relationship between “LotArea” and “SalePrice”.
# Load necessary library
library(psych)
# Invert the correlation matrix to obtain the precision matrix
precision_matrix <- solve(cor_matrix)
# Multiply the correlation matrix by the precision matrix
cor_precision_multiply <- cor_matrix %*% precision_matrix
# Multiply the precision matrix by the correlation matrix
precision_cor_multiply <- precision_matrix %*% cor_matrix
# Perform principal component analysis (PCA)
pca_result <- principal(cor_matrix, nfactors = 2, rotate = "none")
# Print the PCA results
print("PCA Results:")
## [1] "PCA Results:"
print(pca_result)
## Principal Components Analysis
## Call: principal(r = cor_matrix, nfactors = 2, rotate = "none")
## Standardized loadings (pattern matrix) based upon correlation matrix
## PC1 PC2 h2 u2 com
## LotArea 0.79 -0.61 1 1.1e-16 1.9
## SalePrice 0.79 0.61 1 1.1e-16 1.9
##
## PC1 PC2
## SS loadings 1.26 0.74
## Proportion Var 0.63 0.37
## Cumulative Var 0.63 1.00
## Proportion Explained 0.63 0.37
## Cumulative Proportion 0.63 1.00
##
## Mean item complexity = 1.9
## Test of the hypothesis that 2 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0
##
## Fit based upon off diagonal values = 1
# Interpretation
# The PCA results provide information about the underlying structure of the variables.
# The loadings indicate the correlation between the original variables and the principal components.
# The eigenvalues represent the variance explained by each principal component.
# The proportion of variance explained by each component can be used to assess the contribution of each component to the total variance.
# By examining the loadings and eigenvalues, we can identify which variables contribute most to each principal component and understand the relationships between variables in the dataset.
PCA helps in identifying patterns and relationships between variables in the dataset. It reduces the dimensionality of the data while retaining most of its variability, making it easier to interpret and visualize. By examining the loadings and eigenvalues, we gain insights into the underlying structure of the dataset and the relationships between variables. PCA can be used for data reduction, feature selection, and data visualization, making it a valuable tool in exploratory data analysis and dimensionality reduction techniques.
shifted_LotArea <- train$LotArea - min(train$LotArea) + 1 # Shifted variable with minimum value above zero
head(shifted_LotArea)
## [1] 7151 8301 9951 8251 12961 12816
library(MASS)
fit_exp <- fitdistr(shifted_LotArea, "exponential") # Fit exponential distribution
lambda <- fit_exp$estimate # Optimal lambda value
head(lambda)
## rate
## 0.0001084854
samples_exp <- rexp(1000, lambda) # Generate 1000 samples from exponential distribution
head(samples_exp)
## [1] 16764.94038 7031.50121 28.57506 8234.16895 799.68282 4037.66132
hist(shifted_LotArea, main = "Histogram of Shifted LotArea", xlab = "Shifted LotArea", col = "seagreen", breaks = 30)
hist(samples_exp, main = "Histogram of Samples from Exponential Distribution", xlab = "Samples", col = "pink", breaks = 30)
percentile_5 <- qexp(0.05, rate = lambda) # 5th percentile using exponential distribution
percentile_95 <- qexp(0.95, rate = lambda) # 95th percentile using exponential distribution
percentile_5
## [1] 472.8128
percentile_95
## [1] 27614.15
# Calculate Empirical Percentiles
mean_LotArea <- mean(shifted_LotArea)
sd_LotArea <- sd(shifted_LotArea)
conf_interval <- c(mean_LotArea - 1.96 * sd_LotArea / sqrt(length(shifted_LotArea)),
mean_LotArea + 1.96 * sd_LotArea / sqrt(length(shifted_LotArea)))
# Calculate Empirical Percentiles
empirical_percentile_5 <- quantile(shifted_LotArea, 0.05)
empirical_percentile_95 <- quantile(shifted_LotArea, 0.95)
# Print Results
print("Optimal Lambda for Exponential Distribution:")
## [1] "Optimal Lambda for Exponential Distribution:"
print(lambda)
## rate
## 0.0001084854
print("5th and 95th Percentiles using Exponential Distribution:")
## [1] "5th and 95th Percentiles using Exponential Distribution:"
print(c(percentile_5, percentile_95))
## [1] 472.8128 27614.1451
print("95% Confidence Interval from Empirical Data:")
## [1] "95% Confidence Interval from Empirical Data:"
print(conf_interval)
## [1] 8705.834 9729.823
print("Empirical 5th and 95th Percentiles of Shifted LotArea:")
## [1] "Empirical 5th and 95th Percentiles of Shifted LotArea:"
The results from this section help us understand the distributional properties of the “LotArea” variable and assess the suitability of the exponential distribution as a model. They provide valuable information for making inferences and decisions based on the data. More specifically: - By fitting an exponential probability density function to the shifted “LotArea” variable, we attempted to model its distribution with a known parametric form.The optimal λ value obtained from the fit represents the rate parameter of the exponential distribution, indicating the average rate of occurrence for “LotArea” values. - The comparison between the histograms of the original data and the fitted exponential distribution helps us assess the goodness-of-fit. Deviations between the two histograms may suggest limitations in the assumed distribution.In other words, this visual comparison of the histograms is an easily way to assess how well the exponential distribution fits the empirical data. If the histograms match closely, it suggests a good fit.
data <- read.csv("https://raw.githubusercontent.com/Heleinef/Data-Science-Master_Heleine/main/train_csv.csv")
train = data
# Fit linear regression model
lm_model <- lm(SalePrice ~ LotArea, data = train)
# Get model summary
summary(lm_model)
##
## Call:
## lm(formula = SalePrice ~ LotArea, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -275668 -48169 -17725 31248 553356
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.588e+05 2.915e+03 54.49 <2e-16 ***
## LotArea 2.100e+00 2.011e-01 10.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 76650 on 1458 degrees of freedom
## Multiple R-squared: 0.06961, Adjusted R-squared: 0.06898
## F-statistic: 109.1 on 1 and 1458 DF, p-value: < 2.2e-16
par(mfrow=c(1,1))
plot(lm_model)
hist(lm_model$residuals)
A Box-Cox transformation is “an optimization algorithm designed to find the best powers to raise each variable so that the resulting distribution cannot be rejected.”(Prof.Larry Fulton)
# Load necessary libraries
library(MASS)
# Fit initial linear regression model
lm_model <- lm(SalePrice ~ LotArea, data = train)
# Apply Box-Cox transformation
boxcox_result <- boxcox(lm_model, lambda = seq(-2, 2, by = 0.1))
# Find the lambda value that maximizes the log-likelihood
lambda_optimal <- boxcox_result$x[which.max(boxcox_result$y)]
# Display the optimal lambda value
cat("Optimal lambda for Box-Cox transformation:", lambda_optimal, "\n")
## Optimal lambda for Box-Cox transformation: -0.06060606
# Transform the response variable using the optimal lambda
if(lambda_optimal == 0) {
train$SalePrice_transformed <- log(train$SalePrice)
} else {
train$SalePrice_transformed <- (train$SalePrice^lambda_optimal - 1) / lambda_optimal
}
# Fit the linear regression model again using the transformed response variable
lm_model_transformed <- lm(SalePrice_transformed ~ LotArea, data = train)
# Get model summary
summary(lm_model_transformed)
##
## Call:
## lm(formula = SalePrice_transformed ~ LotArea, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.77614 -0.11315 -0.00998 0.11754 0.66770
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.484e+00 7.081e-03 1198.1 <2e-16 ***
## LotArea 4.936e-06 4.885e-07 10.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1862 on 1458 degrees of freedom
## Multiple R-squared: 0.06544, Adjusted R-squared: 0.0648
## F-statistic: 102.1 on 1 and 1458 DF, p-value: < 2.2e-16
par(mfrow=c(1,1))
plot(lm_model_transformed )
hist(lm_model_transformed$residuals)
# Fit A multiple regression model
# Load necessary libraries
library(dplyr)
# Load the dataset
data <- read.csv("https://raw.githubusercontent.com/Heleinef/Data-Science-Master_Heleine/main/train_csv.csv")
train = data
# Convert categorical variables to factors
train$KitchenQual <- as.factor(train$KitchenQual)
# Fit the multiple regression model with the specified variables, excluding Neighborhood
lmModel <- lm(SalePrice ~ LotArea + GrLivArea + PoolArea + GarageCars + KitchenQual+OverallCond+ Neighborhood,data = train)
# Get model summary
summary(lmModel)
##
## Call:
## lm(formula = SalePrice ~ LotArea + GrLivArea + PoolArea + GarageCars +
## KitchenQual + OverallCond + Neighborhood, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -333611 -16485 -432 15491 245336
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.263e+04 1.203e+04 7.700 2.52e-14 ***
## LotArea 5.993e-01 1.054e-01 5.688 1.56e-08 ***
## GrLivArea 5.782e+01 2.361e+00 24.485 < 2e-16 ***
## PoolArea 1.202e+01 2.377e+01 0.506 0.613094
## GarageCars 1.741e+04 1.721e+03 10.118 < 2e-16 ***
## KitchenQualFa -7.705e+04 7.742e+03 -9.952 < 2e-16 ***
## KitchenQualGd -5.935e+04 4.307e+03 -13.780 < 2e-16 ***
## KitchenQualTA -7.283e+04 4.797e+03 -15.183 < 2e-16 ***
## OverallCond 6.507e+03 9.434e+02 6.898 7.92e-12 ***
## NeighborhoodBlueste -3.764e+04 2.663e+04 -1.414 0.157708
## NeighborhoodBrDale -3.829e+04 1.267e+04 -3.023 0.002550 **
## NeighborhoodBrkSide -3.144e+04 1.025e+04 -3.068 0.002195 **
## NeighborhoodClearCr -7.519e+03 1.141e+04 -0.659 0.510160
## NeighborhoodCollgCr 4.034e+03 9.120e+03 0.442 0.658330
## NeighborhoodCrawfor 2.698e+02 1.031e+04 0.026 0.979126
## NeighborhoodEdwards -3.664e+04 9.698e+03 -3.778 0.000165 ***
## NeighborhoodGilbert -6.373e+03 9.596e+03 -0.664 0.506727
## NeighborhoodIDOTRR -4.518e+04 1.081e+04 -4.181 3.08e-05 ***
## NeighborhoodMeadowV -4.026e+04 1.253e+04 -3.214 0.001338 **
## NeighborhoodMitchel -1.441e+04 1.023e+04 -1.408 0.159460
## NeighborhoodNAmes -2.358e+04 9.268e+03 -2.544 0.011062 *
## NeighborhoodNoRidge 6.332e+04 1.051e+04 6.024 2.17e-09 ***
## NeighborhoodNPkVill -2.397e+04 1.479e+04 -1.621 0.105217
## NeighborhoodNridgHt 5.674e+04 9.713e+03 5.842 6.38e-09 ***
## NeighborhoodNWAmes -1.810e+04 9.835e+03 -1.840 0.065916 .
## NeighborhoodOldTown -5.417e+04 9.641e+03 -5.619 2.31e-08 ***
## NeighborhoodSawyer -2.470e+04 9.895e+03 -2.496 0.012677 *
## NeighborhoodSawyerW -7.948e+03 9.842e+03 -0.808 0.419497
## NeighborhoodSomerst 1.657e+04 9.422e+03 1.759 0.078827 .
## NeighborhoodStoneBr 6.649e+04 1.124e+04 5.913 4.19e-09 ***
## NeighborhoodSWISU -4.508e+04 1.167e+04 -3.864 0.000117 ***
## NeighborhoodTimber 1.475e+04 1.057e+04 1.395 0.163104
## NeighborhoodVeenker 2.401e+04 1.387e+04 1.731 0.083676 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35410 on 1427 degrees of freedom
## Multiple R-squared: 0.8056, Adjusted R-squared: 0.8013
## F-statistic: 184.8 on 32 and 1427 DF, p-value: < 2.2e-16
Residuals:
Min 1Q Median 3Q Max -333611 -16485 -432 15491 245336
The residuals represent the difference between observed and predicted values. The median residual being close to zero suggests that the model is unbiased, though the wide range indicates there are some large discrepancies between predicted and actual sale prices.
Significant Predictors:
LotArea and GrLivArea have a positive effect on SalePrice.
GarageCars significantly increases SalePrice.
Kitchen Quality has a strong impact, with lower qualities (Fair, Good, Typical/Average) significantly reducing the sale price compared to Excellent quality.
Overall Condition positively affects the sale price. Several Neighborhoods significantly affect sale prices, either positively or negatively.
Non-significant Predictors:
PoolArea does not significantly affect the sale price and Many neighborhoods do not have a significant impact compared to the baseline. Model Fit
Our model explains a substantial portion of the variance (80.56%), indicating a good fit. The residual standard error suggests some variability in sale prices is not explained by the model, indicating room for improvement.
Overall, our regression model provides a comprehensive understanding of the factors influencing house prices. Significant predictors like lot area, living area, garage capacity, kitchen quality, and overall condition are crucial determinants of sale prices. Additionally, neighborhood effects highlight the importance of location. The model fits the data well, explaining over 80% of the variance, but there is potential for further refinement and improvement.
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
# checking for multicollinearity
# Fit the multiple regression model again
lmModel <- lm(SalePrice ~ LotArea + GrLivArea + PoolArea + GarageCars + KitchenQual + OverallCond + Neighborhood, data = train)
# Check for multicollinearity using Variance Inflation Factor (VIF)
vif_values <- vif(lmModel)
print(vif_values)
## GVIF Df GVIF^(1/(2*Df))
## LotArea 1.286673 1 1.134316
## GrLivArea 1.791208 1 1.338360
## PoolArea 1.061342 1 1.030215
## GarageCars 1.923633 1 1.386951
## KitchenQual 2.684020 3 1.178866
## OverallCond 1.282228 1 1.132355
## Neighborhood 5.006971 24 1.034128
The multicollinearity check of our model reveals that: - LotArea, GrLivArea, PoolArea, GarageCars, and OverallCond have VIF values close to 1, indicating low multicollinearity. - KitchenQual has a VIF of 2.68, which is slightly above 1, but still within an acceptable range, suggesting moderate multicollinearity. - Neighborhood has a VIF of 5.01, which is higher than the usual threshold of 5, indicating potential multicollinearity issues. This suggests that the Neighborhood variable may have strong linear relationships with other predictor variables in the model. Action: We will apply a regularization technique called box transformation
# Check for heteroscedasticity using the score test
score_test <- ncvTest(lmModel)
print(score_test)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 1669.506, Df = 1, p = < 2.22e-16
In this case, the p-value is extremely small (p < 2.22e-16), indicating strong evidence against the null hypothesis of homoscedasticity. Therefore, we reject the null hypothesis and conclude that there is heteroscedasticity present in the model. Because the presence of heteroscedasticity can lead to inefficient and biased estimates of the coefficients, we will apply a box- transformation algorithm to our model at to mitigate heteroscedasticity issue.
# Plot residuals vs fitted values to visually inspect heteroscedasticity
plot(lmModel, which = 1)
# Check for normality of residuals using the Shapiro-Wilk test
shapiro_test_result <- shapiro.test(lmModel$residuals)
print(shapiro_test_result)
##
## Shapiro-Wilk normality test
##
## data: lmModel$residuals
## W = 0.86935, p-value < 2.2e-16
# Plot Q-Q plot of residuals to visually inspect normality
qqnorm(lmModel$residuals)
qqline(lmModel$residuals)
# Plot histogram of residuals
hist(lmModel$residuals, breaks = 30, main = "Histogram of Residuals", xlab = "Residuals")
# Additional diagnostic plots
par(mfrow = c(1, 1))
plot(lmModel)
par(mfrow = c(1, 1))
# Load necessary library
library(car)
# Check if lmModel contains predictor variables
if (length(lmModel$coefficients) > 1) {
# Calculate VIF values
vif_values <- vif(lmModel)
# Print VIF values
print(vif_values)
} else {
print("No predictor variables found in the linear regression model.")
}
## GVIF Df GVIF^(1/(2*Df))
## LotArea 1.286673 1 1.134316
## GrLivArea 1.791208 1 1.338360
## PoolArea 1.061342 1 1.030215
## GarageCars 1.923633 1 1.386951
## KitchenQual 2.684020 3 1.178866
## OverallCond 1.282228 1 1.132355
## Neighborhood 5.006971 24 1.034128
# Visual check for heteroscedasticity
ggplot(train, aes(fitted(lmModel), residuals(lmModel))) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "Residuals vs Fitted", x = "Fitted values", y = "Residuals")
## `geom_smooth()` using formula = 'y ~ x'
# Visual check for normality
qqplot <- ggplot(data.frame(residuals = lmModel$residuals), aes(sample = residuals)) +
stat_qq() +
stat_qq_line() +
labs(title = "Q-Q Plot of Residuals", x = "Theoretical Quantiles", y = "Sample Quantiles")
# Histogram of residuals
histogram <- ggplot(data.frame(residuals = lmModel$residuals), aes(x = residuals)) +
geom_histogram(aes(y = ..density..), bins = 30) +
geom_density(col = "seagreen") +
labs(title = "Histogram of Residuals", x = "Residuals", y = "Density")
# Combine the plots
grid.arrange(qqplot, histogram, ncol = 2)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Fit the linear regression model
lmModel <- lm(SalePrice ~ LotArea + GrLivArea + PoolArea + GarageCars + KitchenQual + OverallCond + Neighborhood, data = train)
# Determine optimal lambda value for Box-Cox transformation
boxcox_lambda <- boxcox(lmModel)
# Apply Box-Cox transformation using the optimal lambda
transformed_lmModel <- lm(SalePrice ~ LotArea + GrLivArea + PoolArea + GarageCars + KitchenQual + OverallCond + Neighborhood,
data = train,
lambda = boxcox_lambda$x[which.max(boxcox_lambda$y)])
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'lambda' will be disregarded
hist(transformed_lmModel$residuals)