library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
train_data = read.csv('train.csv')
head(train_data,3)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1 1 60 RL 65 8450 Pave <NA> Reg Lvl
## 2 2 20 RL 80 9600 Pave <NA> Reg Lvl
## 3 3 60 RL 68 11250 Pave <NA> IR1 Lvl
## Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1 AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 2 AllPub FR2 Gtl Veenker Feedr Norm 1Fam
## 3 AllPub Inside Gtl CollgCr Norm Norm 1Fam
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1 2Story 7 5 2003 2003 Gable CompShg
## 2 1Story 6 8 1976 1976 Gable CompShg
## 3 2Story 7 5 2001 2002 Gable CompShg
## Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1 VinylSd VinylSd BrkFace 196 Gd TA PConc
## 2 MetalSd MetalSd None 0 TA TA CBlock
## 3 VinylSd VinylSd BrkFace 162 Gd TA PConc
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1 Gd TA No GLQ 706 Unf
## 2 Gd TA Gd ALQ 978 Unf
## 3 Gd TA Mn GLQ 486 Unf
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1 0 150 856 GasA Ex Y SBrkr
## 2 0 284 1262 GasA Ex Y SBrkr
## 3 0 434 920 GasA Ex Y SBrkr
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1 856 854 0 1710 1 0 2
## 2 1262 0 0 1262 0 1 2
## 3 920 866 0 1786 1 0 2
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1 1 3 1 Gd 8 Typ
## 2 0 3 1 TA 6 Typ
## 3 1 3 1 Gd 6 Typ
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1 0 <NA> Attchd 2003 RFn 2
## 2 1 TA Attchd 1976 RFn 2
## 3 1 TA Attchd 2001 RFn 2
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1 548 TA TA Y 0 61
## 2 460 TA TA Y 298 0
## 3 608 TA TA Y 0 42
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1 0 0 0 0 <NA> <NA> <NA>
## 2 0 0 0 0 <NA> <NA> <NA>
## 3 0 0 0 0 <NA> <NA> <NA>
## MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1 0 2 2008 WD Normal 208500
## 2 0 5 2007 WD Normal 181500
## 3 0 9 2008 WD Normal 223500
# TotalBsmtSF is right skewed
ggplot(train_data, aes(x = TotalBsmtSF)) +
geom_histogram(binwidth = 100, fill = "blue", alpha = 0.7) +
labs(title = "TotalBsmtSF Histogram") +
theme_minimal()
# Calculating quartiles
x_3rd_quartile <- quantile(train_data$TotalBsmtSF, 0.75, na.rm = TRUE)
y_2nd_quartile <- quantile(train_data$SalePrice, 0.50, na.rm = TRUE)
print(paste("3rd Quartile of TotalBsmtSF:", x_3rd_quartile))
## [1] "3rd Quartile of TotalBsmtSF: 1298.25"
print(paste("2nd Quartile of SalePrice:", y_2nd_quartile))
## [1] "2nd Quartile of SalePrice: 163000"
# New variables based on quartiles
train_data$TotalBsmtSF_above_3rd <- train_data$TotalBsmtSF > x_3rd_quartile
train_data$SalePrice_above_2nd <- train_data$SalePrice > y_2nd_quartile
# Creating table of counts
counts <- table(train_data$TotalBsmtSF_above_3rd, train_data$SalePrice_above_2nd)
# Calculating probabilities
P_X_gt_x_given_Y_gt_y <- counts[2, 2] / sum(counts[, 2])
P_X_gt_x_and_Y_gt_y <- counts[2, 2] / sum(counts)
P_X_lt_x_given_Y_gt_y <- counts[1, 2] / sum(counts[, 2])
print(paste("P(X > x | Y > y):", P_X_gt_x_given_Y_gt_y))
## [1] "P(X > x | Y > y): 0.451923076923077"
print(paste("P(X > x, Y > y):", P_X_gt_x_and_Y_gt_y))
## [1] "P(X > x, Y > y): 0.225342465753425"
print(paste("P(X < x | Y > y):", P_X_lt_x_given_Y_gt_y))
## [1] "P(X < x | Y > y): 0.548076923076923"
# Creating the contingency table
contingency_table <- table(cut(train_data$TotalBsmtSF, breaks=c(-Inf, x_3rd_quartile, Inf)),
cut(train_data$SalePrice, breaks=c(-Inf, y_2nd_quartile, Inf)))
# Converting to a matrix and adding row and column totals
contingency_matrix <- addmargins(as.matrix(contingency_table))
# Renaming rows and columns for clarity
rownames(contingency_matrix) <- c("<=3rd quartile", ">3rd quartile", "Total")
colnames(contingency_matrix) <- c("<=2nd quartile", ">2nd quartile", "Total")
# Creating a new matrix with additional row and column for labels
new_matrix <- matrix("", nrow=nrow(contingency_matrix) + 1, ncol=ncol(contingency_matrix) + 1)
new_matrix[1, 2:ncol(new_matrix)] <- c("<=2nd quartile", ">2nd quartile", "Total")
new_matrix[2:nrow(new_matrix), 1] <- c("<=3rd quartile", ">3rd quartile", "Total")
new_matrix[2:nrow(new_matrix), 2:ncol(new_matrix)] <- contingency_matrix
new_matrix[1, 1] <- "X/Y"
# Displaying the table using kable
kable(new_matrix, format = "html", caption = "Contingency Table of TotalBsmtSF and SalePrice with Custom Labels") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
X/Y            | <=2nd quartile | >2nd quartile | Total
<=3rd quartile |            696 |           399 |  1095
>3rd quartile  |             36 |           329 |   365
Total          |            732 |           728 |  1460
# Creating new variables based on quartiles
train_data$A <- train_data$TotalBsmtSF > x_3rd_quartile
train_data$B <- train_data$SalePrice > y_2nd_quartile
# Calculating probabilities
P_A <- mean(train_data$A)
P_B <- mean(train_data$B)
P_A_given_B <- mean(train_data$A & train_data$B) / P_B
# probabilities
print(paste("P(A):", P_A))
## [1] "P(A): 0.25"
print(paste("P(B):", P_B))
## [1] "P(B): 0.498630136986301"
print(paste("P(A | B):", P_A_given_B))
## [1] "P(A | B): 0.451923076923077"
# Checking if P(A | B) ≈ P(A) * P(B)
P_A_times_P_B <- P_A * P_B
independence_check <- abs(P_A_given_B - P_A_times_P_B) < 0.01 # threshold for checking near equality
print(paste("P(A | B) ≈ P(A) * P(B):", independence_check))
## [1] "P(A | B) ≈ P(A) * P(B): FALSE"
# Chi-Square test for association
contingency_table <- table(train_data$A, train_data$B)
chi_square_test <- chisq.test(contingency_table)
print(chi_square_test)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: contingency_table
## X-squared = 313.61, df = 1, p-value < 2.2e-16
Based on both the mathematical check and the Chi-Square test results, we conclude that the variables \(A\) (TotalBsmtSF > 1298.25) and \(B\) (SalePrice > $163,000) are dependent: there is a significant association between the size of the basement area and the sale price of the house.
# Summary statistics for TotalBsmtSF and SalePrice
summary_stats <- train_data %>%
summarise(
TotalBsmtSF_mean = mean(TotalBsmtSF, na.rm = TRUE),
TotalBsmtSF_median = median(TotalBsmtSF, na.rm = TRUE),
TotalBsmtSF_sd = sd(TotalBsmtSF, na.rm = TRUE),
SalePrice_mean = mean(SalePrice, na.rm = TRUE),
SalePrice_median = median(SalePrice, na.rm = TRUE),
SalePrice_sd = sd(SalePrice, na.rm = TRUE)
)
print(summary_stats)
## TotalBsmtSF_mean TotalBsmtSF_median TotalBsmtSF_sd SalePrice_mean
## 1 1057.429 991.5 438.7053 180921.2
## SalePrice_median SalePrice_sd
## 1 163000 79442.5
# Histograms for TotalBsmtSF and SalePrice
ggplot(train_data, aes(x = TotalBsmtSF)) +
geom_histogram(binwidth = 100, fill = "blue", alpha = 0.7) +
labs(title = "Histogram of TotalBsmtSF") +
theme_minimal()
ggplot(train_data, aes(x = SalePrice)) +
geom_histogram(binwidth = 10000, fill = "green", alpha = 0.7) +
labs(title = "Histogram of SalePrice") +
theme_minimal()
# Boxplots for TotalBsmtSF and SalePrice
ggplot(train_data, aes(y = TotalBsmtSF)) +
geom_boxplot(fill = "blue", alpha = 0.7) +
labs(title = "Boxplot of TotalBsmtSF") +
theme_minimal()
ggplot(train_data, aes(y = SalePrice)) +
geom_boxplot(fill = "green", alpha = 0.7) +
labs(title = "Boxplot of SalePrice") +
theme_minimal()
# Scatterplot of TotalBsmtSF and SalePrice
ggplot(train_data, aes(x = TotalBsmtSF, y = SalePrice)) +
geom_point(alpha = 0.5) +
labs(title = "Scatterplot of TotalBsmtSF and SalePrice",
x = "TotalBsmtSF",
y = "SalePrice") +
theme_minimal()
# 95% Confidence Interval for the mean of TotalBsmtSF
mean_totalbsmt <- mean(train_data$TotalBsmtSF, na.rm = TRUE)
se_totalbsmt <- sd(train_data$TotalBsmtSF, na.rm = TRUE) / sqrt(length(train_data$TotalBsmtSF))
ci_totalbsmt <- mean_totalbsmt + c(-1, 1) * qt(0.975, df = length(train_data$TotalBsmtSF) - 1) * se_totalbsmt
print(paste("95% Confidence Interval for the mean of TotalBsmtSF: (", ci_totalbsmt[1], ", ", ci_totalbsmt[2], ")", sep = ""))
## [1] "95% Confidence Interval for the mean of TotalBsmtSF: (1034.90755354321, 1079.95135056638)"
# 95% Confidence Interval for the mean of SalePrice
mean_saleprice <- mean(train_data$SalePrice, na.rm = TRUE)
se_saleprice <- sd(train_data$SalePrice, na.rm = TRUE) / sqrt(length(train_data$SalePrice))
ci_saleprice <- mean_saleprice + c(-1, 1) * qt(0.975, df = length(train_data$SalePrice) - 1) * se_saleprice
print(paste("95% Confidence Interval for the mean of SalePrice: (", ci_saleprice[1], ", ", ci_saleprice[2], ")", sep = ""))
## [1] "95% Confidence Interval for the mean of SalePrice: (176842.841041085, 184999.550739737)"
# Select another quantitative variable, e.g., GrLivArea
train_data_selected <- train_data %>%
dplyr::select(TotalBsmtSF, SalePrice, GrLivArea)
# Calculate the correlation matrix
cor_matrix <- cor(train_data_selected, use = "complete.obs")
print(cor_matrix)
## TotalBsmtSF SalePrice GrLivArea
## TotalBsmtSF 1.0000000 0.6135806 0.4548682
## SalePrice 0.6135806 1.0000000 0.7086245
## GrLivArea 0.4548682 0.7086245 1.0000000
# Testing the hypothesis that the correlation between TotalBsmtSF and SalePrice is 0
cor_test <- cor.test(train_data$TotalBsmtSF, train_data$SalePrice, conf.level = 0.99)
print(cor_test)
##
## Pearson's product-moment correlation
##
## data: train_data$TotalBsmtSF and train_data$SalePrice
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## 0.5697562 0.6539251
## sample estimates:
## cor
## 0.6135806
# 99% Confidence Interval for the correlation coefficient
cor_ci <- cor_test$conf.int
print(paste("99% Confidence Interval for the correlation coefficient: (", cor_ci[1], ", ", cor_ci[2], ")", sep = ""))
## [1] "99% Confidence Interval for the correlation coefficient: (0.569756242152249, 0.653925081691453)"
Based on the outputs and plots above, we can draw several insights about the relationship between TotalBsmtSF (Total square feet of basement area) and SalePrice (the property’s sale price in dollars):
TotalBsmtSF:
- The histogram shows a right-skewed distribution with most values concentrated between 500 and 1500 sq ft.
- The boxplot confirms the right skewness, with a few extreme outliers above 4000 sq ft.

SalePrice:
- The histogram shows a right-skewed distribution with most values concentrated between $100,000 and $200,000.
- The boxplot confirms the right skewness, with several extreme outliers above $400,000.
The scatterplot of TotalBsmtSF vs. SalePrice shows a positive linear relationship: as TotalBsmtSF increases, SalePrice also tends to increase, though with some variability.
The analysis of TotalBsmtSF (Total square feet of basement area) and SalePrice reveals several key insights. Both variables are right-skewed, with most values concentrated below their means and a few extreme outliers, particularly in SalePrice. There is a strong positive linear relationship between TotalBsmtSF and SalePrice, indicating that larger basement areas are associated with higher sale prices and suggesting buyers value additional basement space. The correlation is statistically significant, with a very low p-value and a strong positive correlation coefficient, and the confidence intervals for the means and the correlation coefficient further support the reliability of these estimates. Understanding this positive relationship can help homeowners and real estate agents make informed decisions about property investments and pricing strategies. Further analysis could explore additional factors influencing sale prices and their interactions with basement area and other home features.
# Inverting the correlation matrix to get the precision matrix
precision_matrix <- solve(cor_matrix)
print("Precision Matrix:")
## [1] "Precision Matrix:"
print(precision_matrix)
## TotalBsmtSF SalePrice GrLivArea
## TotalBsmtSF 1.60588442 -0.9394642 -0.06473842
## SalePrice -0.93946422 2.5582310 -1.38549273
## GrLivArea -0.06473842 -1.3854927 2.01124151
# Multiplying the correlation matrix by the precision matrix
identity_matrix_1 <- cor_matrix %*% precision_matrix
print("Correlation Matrix * Precision Matrix:")
## [1] "Correlation Matrix * Precision Matrix:"
print(identity_matrix_1)
## TotalBsmtSF SalePrice GrLivArea
## TotalBsmtSF 1.000000e+00 0 0.000000e+00
## SalePrice 0.000000e+00 1 -2.220446e-16
## GrLivArea 2.775558e-17 0 1.000000e+00
# Multiplying the precision matrix by the correlation matrix
identity_matrix_2 <- precision_matrix %*% cor_matrix
print("Precision Matrix * Correlation Matrix:")
## [1] "Precision Matrix * Correlation Matrix:"
print(identity_matrix_2)
## TotalBsmtSF SalePrice GrLivArea
## TotalBsmtSF 1.000000e+00 5.551115e-17 1.110223e-16
## SalePrice -1.110223e-16 1.000000e+00 -2.220446e-16
## GrLivArea 0.000000e+00 0.000000e+00 1.000000e+00
# Performing PCA on the selected variables
pca_result <- prcomp(train_data_selected, scale. = TRUE)
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.4801 0.7441 0.50553
## Proportion of Variance 0.7302 0.1846 0.08519
## Cumulative Proportion 0.7302 0.9148 1.00000
# Printing the loadings (principal components)
print("PCA Loadings:")
## [1] "PCA Loadings:"
print(pca_result$rotation)
## PC1 PC2 PC3
## TotalBsmtSF 0.5376254 0.7878858 -0.3003246
## SalePrice 0.6182605 -0.1261688 0.7757805
## GrLivArea 0.5733349 -0.6027582 -0.5549502
# Scree plot to visualize the explained variance by each principal component
screeplot(pca_result, type = "lines", main = "Scree Plot")
Precision Matrix: \[ \begin{bmatrix} 1.60588442 & -0.9394642 & -0.06473842 \\ -0.93946422 & 2.5582310 & -1.38549273 \\ -0.06473842 & -1.3854927 & 2.01124151 \end{bmatrix} \]
The diagonal elements of the precision matrix are the variance inflation factors (VIFs) for each variable, indicating the extent of multicollinearity.
The off-diagonal elements are proportional to the negative partial correlations between pairs of variables after accounting for the remaining variable: the partial correlation between variables \(i\) and \(j\) is \(-p_{ij}/\sqrt{p_{ii}\,p_{jj}}\).
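A minimal sketch of applying that relation to the precision_matrix object computed above:
# Convert the precision matrix into a matrix of partial correlations
D <- diag(1 / sqrt(diag(precision_matrix)))
partial_cor <- -D %*% precision_matrix %*% D
diag(partial_cor) <- 1
dimnames(partial_cor) <- dimnames(precision_matrix)
print(partial_cor)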
Correlation Matrix * Precision Matrix: \[ \begin{bmatrix} 1.000000e+00 & 0 & 0 \\ 0 & 1 & -2.220446e-16 \\ 2.775558e-17 & 0 & 1.000000e+00 \end{bmatrix} \]
Precision Matrix * Correlation Matrix: \[ \begin{bmatrix} 1.000000e+00 & 5.551115e-17 & 1.110223e-16 \\ -1.110223e-16 & 1.000000e+00 & -2.220446e-16 \\ 0 & 0 & 1.000000e+00 \end{bmatrix} \]
Both multiplications result in values close to the identity matrix, confirming the correctness of the precision matrix inversion.
The small non-zero off-diagonal elements are due to numerical precision limits.
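A quick check makes this concrete: the largest absolute deviation of each product from the exact identity matrix is on the order of machine epsilon.
# Largest absolute deviation of each product from the exact 3x3 identity
max(abs(identity_matrix_1 - diag(3)))
max(abs(identity_matrix_2 - diag(3)))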
Importance of Components: \[ \begin{array}{ccc} & \text{PC1} & \text{PC2} & \text{PC3} \\ \text{Standard deviation} & 1.4801 & 0.7441 & 0.50553 \\ \text{Proportion of Variance} & 0.7302 & 0.1846 & 0.08519 \\ \text{Cumulative Proportion} & 0.7302 & 0.9148 & 1.00000 \\ \end{array} \]
PCA Loadings: \[ \begin{array}{cccc} & \text{PC1} & \text{PC2} & \text{PC3} \\ \text{TotalBsmtSF} & 0.5376254 & 0.7878858 & -0.3003246 \\ \text{SalePrice} & 0.6182605 & -0.1261688 & 0.7757805 \\ \text{GrLivArea} & 0.5733349 & -0.6027582 & -0.5549502 \\ \end{array} \]
PC1 captures 73.02% of the variance, indicating it is the most important component.
PC2 captures 18.46% of the variance and PC3 captures 8.52%; PC1 and PC2 together explain 91.48% of the variance.
The loadings for PC1 show that all three variables (TotalBsmtSF, SalePrice, and GrLivArea) contribute significantly, with positive coefficients indicating they move together.
PC2 and PC3 help differentiate the variance not captured by PC1, with different weights indicating their unique contributions to explaining the data variance.
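These proportions can be recomputed directly from the component standard deviations returned by prcomp() (a small sketch using the pca_result object above):
# Proportion and cumulative proportion of variance explained, from the sdev vector
var_explained <- pca_result$sdev^2 / sum(pca_result$sdev^2)
round(var_explained, 4)
round(cumsum(var_explained), 4)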
Scree Plot: The scree plot shows a clear “elbow” after the first principal component (PC1), indicating that PC1 captures the most significant variance in the data.
The PCA results reveal that the first principal component (PC1) captures the majority of the variance (73.02%) and is influenced by all three variables (TotalBsmtSF, SalePrice, and GrLivArea). This suggests a strong underlying factor that integrates these three variables, likely related to overall property size and value.
The precision matrix provides insights into the relationships between variables, showing the variance inflation factors on the diagonal. The matrix multiplications confirm the accuracy of the inversion, showing identity matrices with small numerical precision errors.
The scree plot indicates that retaining the first two principal components (explaining 91.48% of the variance) would be sufficient for a reduced-dimensionality representation of the data.
These analyses provide a deeper understanding of the relationships between the variables, emphasizing the significance of basement area and living area in determining property prices.
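As an illustration of that reduced representation, the first two principal component scores (stored in pca_result$x) give a two-dimensional summary of each observation that retains about 91.5% of the variance:
# Two-dimensional representation using the first two principal component scores
scores_2d <- as.data.frame(pca_result$x[, 1:2])
head(scores_2d)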
# Shift TotalBsmtSF to ensure all values are above zero
min_value <- min(train_data$TotalBsmtSF)
if (min_value <= 0) {
train_data$TotalBsmtSF_shifted <- train_data$TotalBsmtSF - min_value + 1
} else {
train_data$TotalBsmtSF_shifted <- train_data$TotalBsmtSF
}
# Fit an exponential distribution to the shifted TotalBsmtSF data
fit <- fitdistr(train_data$TotalBsmtSF_shifted, "exponential")
lambda <- fit$estimate
print(paste("Estimated lambda:", lambda))
## [1] "Estimated lambda: 0.000944796082590709"
set.seed(05182024)
# Generate 1000 samples from the fitted exponential distribution
samples <- rexp(1000, rate = lambda)
# Plot histogram of the original shifted TotalBsmtSF data
hist(train_data$TotalBsmtSF_shifted, breaks = 30, main = "Histogram of Shifted TotalBsmtSF", xlab = "TotalBsmtSF_shifted")
# Plot histogram of the sampled data
hist(samples, breaks = 30, main = "Histogram of Sampled Data from Exponential Distribution", xlab = "Sampled Data")
# Calculating 5th and 95th percentiles using the exponential distribution's CDF
percentile_5_exp <- qexp(0.05, rate = lambda)
percentile_95_exp <- qexp(0.95, rate = lambda)
# Calculating 5th and 95th percentiles of the empirical data
percentile_5_empirical <- quantile(train_data$TotalBsmtSF_shifted, 0.05)
percentile_95_empirical <- quantile(train_data$TotalBsmtSF_shifted, 0.95)
print(paste("5th percentile (exponential):", percentile_5_exp))
## [1] "5th percentile (exponential): 54.2903334727004"
print(paste("95th percentile (exponential):", percentile_95_exp))
## [1] "95th percentile (exponential): 3170.77126880061"
print(paste("5th percentile (empirical):", percentile_5_empirical))
## [1] "5th percentile (empirical): 520.3"
print(paste("95th percentile (empirical):", percentile_95_empirical))
## [1] "95th percentile (empirical): 1754"
# Generate a 95% confidence interval for the empirical data assuming normality
mean_empirical <- mean(train_data$TotalBsmtSF_shifted)
sd_empirical <- sd(train_data$TotalBsmtSF_shifted)
ci_lower <- mean_empirical - qnorm(0.975) * (sd_empirical / sqrt(length(train_data$TotalBsmtSF_shifted)))
ci_upper <- mean_empirical + qnorm(0.975) * (sd_empirical / sqrt(length(train_data$TotalBsmtSF_shifted)))
print(paste("95% Confidence Interval for the mean of TotalBsmtSF (empirical): (", ci_lower, ", ", ci_upper, ")", sep = ""))
## [1] "95% Confidence Interval for the mean of TotalBsmtSF (empirical): (1035.92623710885, 1080.93266700074)"
Estimated Lambda: \[ \lambda = 0.000944796082590709 \]
The estimated rate parameter (λ) for the exponential distribution is approximately 0.0009448. This parameter sets how quickly the probability density decreases as the variable increases; its reciprocal, 1/λ ≈ 1058 sq ft, is the fitted mean of the distribution.
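A one-line sanity check: the maximum-likelihood estimate of an exponential rate is the reciprocal of the sample mean, so 1 divided by the mean of the shifted data should reproduce λ.
# The exponential MLE of the rate is 1 / sample mean (approximately 0.0009448 here)
1 / mean(train_data$TotalBsmtSF_shifted)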
The histogram of the shifted TotalBsmtSF data shows a right-skewed distribution.
The histogram of the sampled data from the fitted exponential distribution would match the shape of the original data if the fit were good; here the match is only moderate.
Percentiles from Exponential Distribution:
5th Percentile: 54.29
95th Percentile: 3170.77
Percentiles from Empirical Data:
5th Percentile: 520.3
95th Percentile: 1754
The 5th and 95th percentiles from the empirical data are 520.3 and 1754, respectively, compared to 54.29 and 3170.77 from the exponential distribution. This suggests that the exponential distribution may not perfectly fit the empirical data, particularly at the tails.
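This mismatch can be examined more formally; below is a small sketch of two standard goodness-of-fit checks (note that ties in TotalBsmtSF will trigger a warning from ks.test()):
# Kolmogorov-Smirnov test of the shifted data against the fitted exponential
ks.test(train_data$TotalBsmtSF_shifted, "pexp", rate = lambda)
# Exponential Q-Q plot: points far from the reference line indicate lack of fit
n_obs <- length(train_data$TotalBsmtSF_shifted)
qqplot(qexp(ppoints(n_obs), rate = lambda), train_data$TotalBsmtSF_shifted,
       main = "Exponential Q-Q Plot of Shifted TotalBsmtSF",
       xlab = "Theoretical quantiles", ylab = "Empirical quantiles")
abline(0, 1, col = "red")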
95% Confidence Interval for the Mean of TotalBsmtSF (Empirical): \[ (1035.93, 1080.93) \]
The 95% confidence interval for the mean of TotalBsmtSF is between 1035.93 and 1080.93. This interval provides a range within which we are 95% confident that the true mean of the TotalBsmtSF lies, assuming normality.
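The same interval can be obtained with t.test(), which uses the t quantile instead of the normal quantile and is essentially identical at n = 1460:
# Equivalent 95% CI for the mean using t.test()
t.test(train_data$TotalBsmtSF_shifted)$conf.int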
The analysis reveals several key insights. The estimated rate parameter (\(\lambda\)) for the exponential distribution implies a probability density that decreases as the basement area increases. While the exponential distribution captures the right skewness of TotalBsmtSF, the histograms and percentiles indicate it may not perfectly match the empirical data, especially at the tails. The 95% confidence interval for the mean basement area provides a reliable range, consistent with the summary statistics. Understanding the distribution of TotalBsmtSF aids in assessing variability and central tendency, which is valuable for real estate analysis and decision-making. Further analysis with other distributions may be necessary for a more accurate fit.
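One way to pursue that comparison (a sketch, assuming the log-normal fit below converges on this data) is to fit an alternative distribution with fitdistr() and compare AIC values computed from the stored log-likelihoods:
# Compare exponential and log-normal fits by AIC (lower is better)
fit_exp <- fitdistr(train_data$TotalBsmtSF_shifted, "exponential")
fit_lnorm <- fitdistr(train_data$TotalBsmtSF_shifted, "lognormal")
aic <- function(f) -2 * f$loglik + 2 * length(f$estimate)
c(exponential = aic(fit_exp), lognormal = aic(fit_lnorm))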
# Load the data
train_data <- read.csv("train.csv")
test_data <- read.csv("test.csv")
# Identify and handle missing values in both train and test datasets:
# numeric columns are imputed with the median, categorical columns with their
# most frequent value (base R's mode() returns the storage mode, not the
# statistical mode, so the mode is taken from a frequency table instead)
preprocess_data <- function(data) {
  for (col in names(data)) {
    if (is.numeric(data[[col]])) {
      data[[col]][is.na(data[[col]])] <- median(data[[col]], na.rm = TRUE)
    } else {
      most_frequent <- names(which.max(table(data[[col]])))
      data[[col]][is.na(data[[col]])] <- most_frequent
    }
  }
  return(data)
}
train_data <- preprocess_data(train_data)
test_data <- preprocess_data(test_data)
# Remove target variable 'SalePrice' from training data for dummy variable creation
train_data_no_target <- dplyr::select(train_data, -SalePrice)
# Encode categorical variables using one-hot encoding
dummies <- dummyVars("~ .", data = train_data_no_target)
train_data_encoded <- predict(dummies, newdata = train_data_no_target)
train_data_encoded <- as.data.frame(train_data_encoded)
# Add the target variable back to the encoded training data
train_data_encoded$SalePrice <- train_data$SalePrice
# Encode the test data using the same dummyVars object
test_data_encoded <- predict(dummies, newdata = test_data)
test_data_encoded <- as.data.frame(test_data_encoded)
# Ensure the 'Id' column is included in the test data
test_data_encoded$Id <- test_data$Id
# Ensure both datasets have the same columns
common_columns <- intersect(names(train_data_encoded), names(test_data_encoded))
train_data_encoded <- train_data_encoded[, common_columns]
test_data_encoded <- test_data_encoded[, common_columns]
# Scale the features for PCA
train_data_scaled <- scale(train_data_encoded)
test_data_scaled <- scale(test_data_encoded)
# Perform PCA on the training data
pca <- prcomp(train_data_scaled, center = TRUE, scale. = TRUE)
#summary(pca)
# Plot the explained variance
explained_variance <- cumsum(pca$sdev^2 / sum(pca$sdev^2))
ggplot(data = data.frame(PC = 1:length(explained_variance), Variance = explained_variance), aes(x = PC, y = Variance)) +
geom_line() +
geom_point() +
labs(title = "Explained Variance by Principal Components", x = "Principal Component", y = "Cumulative Proportion of Variance Explained") +
theme_minimal()
# Selecting the number of components that explain a sufficient amount of variance (e.g., 95%)
num_components <- which(explained_variance >= 0.95)[1]
print(paste("Number of components explaining 95% variance:", num_components))
## [1] "Number of components explaining 95% variance: 173"
# Transform the training data using the selected number of principal components
train_data_pca <- as.data.frame(pca$x[, 1:num_components])
train_data_pca$SalePrice <- train_data$SalePrice
# Apply the same transformation to the test data
test_data_pca <- as.data.frame(predict(pca, newdata = test_data_scaled)[, 1:num_components])
test_data_pca$Id <- test_data$Id
# Build a linear regression model using the principal components
model_pca <- lm(SalePrice ~ ., data = train_data_pca)
model_summary <- summary(model_pca)
# Extract the required metrics
residual_standard_error <- model_summary$sigma
multiple_r_squared <- model_summary$r.squared
adjusted_r_squared <- model_summary$adj.r.squared
f_statistic <- model_summary$fstatistic[1]
df1 <- model_summary$fstatistic[2]
df2 <- model_summary$fstatistic[3]
p_value <- pf(f_statistic, df1, df2, lower.tail = FALSE)
# Create a list of the extracted metrics
metrics_list <- list(
Residual_Standard_Error = residual_standard_error,
Multiple_R_Squared = multiple_r_squared,
Adjusted_R_Squared = adjusted_r_squared,
F_Statistic = f_statistic,
Degrees_of_Freedom1 = df1,
Degrees_of_Freedom2 = df2,
P_Value = p_value
)
# Print the list of metrics
print(metrics_list)
## $Residual_Standard_Error
## [1] 26993.96
##
## $Multiple_R_Squared
## [1] 0.8982316
##
## $Adjusted_R_Squared
## [1] 0.8845411
##
## $F_Statistic
## value
## 65.61001
##
## $Degrees_of_Freedom1
## numdf
## 173
##
## $Degrees_of_Freedom2
## dendf
## 1286
##
## $P_Value
## value
## 0
# Check for multicollinearity using VIF (the principal components are orthogonal, so each VIF should be 1)
vif_values_pca <- vif(model_pca)
head(vif_values_pca)
## PC1 PC2 PC3 PC4 PC5 PC6
## 1 1 1 1 1 1
# Predict on the test data
predictions <- predict(model_pca, newdata = test_data_pca)
# Ensure predictions are not negative (SalePrice should be >= 0)
predictions <- pmax(predictions, 0)
# Prepare the submission dataframe
submission <- data.frame(Id = test_data_pca$Id, SalePrice = predictions)
head(submission)
## Id SalePrice
## 1 1461 125850.6
## 2 1462 157995.8
## 3 1463 185727.0
## 4 1464 192724.1
## 5 1465 201799.2
## 6 1466 164076.6
# Write the submission file to CSV
write.csv(submission, "submission.csv", row.names = FALSE)