# Load libraries
library(dplyr)
library(car)
# Load the dataset
data <- read.csv("https://raw.githubusercontent.com/johnnydrodriguez/data605/main/train.csv", header = TRUE)
# Identify quantitative columns and exclude 'Id' and 'SalePrice'
quantitative_columns <- data %>%
select_if(is.numeric) %>%
select(-Id, -SalePrice)
# Compute the correlation of these columns with 'SalePrice'
correlation_matrix <- cor(data %>% select_if(is.numeric))
correlations <- correlation_matrix[, "SalePrice"]
correlations <- sort(correlations, decreasing = TRUE)
# Remove 'SalePrice' from the list of features
correlations <- correlations[names(correlations) != "SalePrice"]
# Display the top 3 features with the highest correlation with 'SalePrice'
top_3_features <- names(correlations)[1:3]
top_3_scores <- correlations[1:3]
print(top_3_scores)
## OverallQual GrLivArea GarageCars
## 0.7909816 0.7086245 0.6404092
# Plot the correlations
pairs(data[c(top_3_features, "SalePrice")],
main = "Correlation Matrix",
pch = 19)
# Plot individual feature correlations with SalePrice
par(mfrow = c(1, 3))
for (feature in top_3_features) {
plot(data[[feature]], data$SalePrice,
main = paste(feature, "vs SalePrice"),
xlab = feature,
ylab = "SalePrice",
pch = 19)
abline(lm(data$SalePrice ~ data[[feature]]), col = "red") # Add a regression line
}
# Build the regression model using the top 3 features
formula <- as.formula(paste("SalePrice ~", paste(top_3_features, collapse = " + ")))
model <- lm(formula, data = data)
# Perform an F-test
anova_results <- anova(model)
print(anova_results)
## Analysis of Variance Table
##
## Response: SalePrice
## Df Sum Sq Mean Sq F value Pr(>F)
## OverallQual 1 5.7609e+12 5.7609e+12 3491.16 < 2.2e-16 ***
## GrLivArea 1 8.1510e+11 8.1510e+11 493.95 < 2.2e-16 ***
## GarageCars 1 2.2924e+11 2.2924e+11 138.92 < 2.2e-16 ***
## Residuals 1456 2.4026e+12 1.6502e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## OverallQual GrLivArea GarageCars
## 1.944062 1.589697 1.612447
All three predictors (OverallQual, GrLivArea, and GarageCars) are highly significant predictors of SalePrice with extremely low p-values (< 2.2e-16).
Multicollinearity: The VIF values for the predictors are all below 2, suggesting that there is no significant multicollinearity among the predictors.
The F-values indicate the relative importance of each predictor in explaining the variability in SalePrice, with OverallQual being the most significant, followed by GrLivArea and GarageCars.
# Function to predict new data
predict_price <- function(new_data) {
predict(model, newdata = new_data)
}
# Output results
list(
model_summary = summary(model),
anova_results = anova_results,
vif_values = vif_values,
prediction_function = predict_price
)
## $model_summary
##
## Call:
## lm(formula = formula, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -340718 -21675 -2085 19500 300177
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -98832.493 4842.897 -20.41 <2e-16 ***
## OverallQual 27104.826 1072.182 25.28 <2e-16 ***
## GrLivArea 50.674 2.552 19.86 <2e-16 ***
## GarageCars 21298.960 1807.065 11.79 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40620 on 1456 degrees of freedom
## Multiple R-squared: 0.7391, Adjusted R-squared: 0.7385
## F-statistic: 1375 on 3 and 1456 DF, p-value: < 2.2e-16
##
##
## $anova_results
## Analysis of Variance Table
##
## Response: SalePrice
## Df Sum Sq Mean Sq F value Pr(>F)
## OverallQual 1 5.7609e+12 5.7609e+12 3491.16 < 2.2e-16 ***
## GrLivArea 1 8.1510e+11 8.1510e+11 493.95 < 2.2e-16 ***
## GarageCars 1 2.2924e+11 2.2924e+11 138.92 < 2.2e-16 ***
## Residuals 1456 2.4026e+12 1.6502e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $vif_values
## OverallQual GrLivArea GarageCars
## 1.944062 1.589697 1.612447
##
## $prediction_function
## function(new_data) {
## predict(model, newdata = new_data)
## }
The model explains a significant portion (73.91%) of the variability in SalePrice, indicating strong predictive power.
All predictors (OverallQual, GrLivArea, and GarageCars) are highly significant and contribute meaningfully to predicting SalePrice.
The residuals suggest some variability in predictions, with a standard error of $40,620 in sale price
Multicollinearity: VIF values indicate no significant multicollinearity issues.
# Load the test dataset
test_data <- read.csv("https://raw.githubusercontent.com/johnnydrodriguez/data605/main/test.csv", header = TRUE)
# Define the top 3 features from the training model (OverallQual, GrLivArea, and GarageCars)
top_3_features <- c("OverallQual", "GrLivArea", "GarageCars")
# Load the training dataset
train_data <- read.csv("https://raw.githubusercontent.com/johnnydrodriguez/data605/main/train.csv", header = TRUE)
# Impute missing values in the test dataset with the median of the training dataset
for (feature in top_3_features) {
if (any(is.na(test_data[[feature]]))) {
median_value <- median(train_data[[feature]], na.rm = TRUE)
test_data[[feature]][is.na(test_data[[feature]])] <- median_value
}
}
# Build the regression model using the top 3 features
formula <- as.formula(paste("SalePrice ~", paste(top_3_features, collapse = " + ")))
model <- lm(formula, data = train_data)
# Function to predict SalePrice using the trained model
predict_saleprice <- function(model, new_data) {
predict(model, newdata = new_data)
}
# Predict SalePrice for the test dataset
test_data$SalePrice <- predict_saleprice(model, test_data)
# Create the output dataframe with Id and predicted SalePrice
output <- test_data %>%
select(Id, SalePrice)
#Export the data dataframe to a CSV file
#write.csv(output, "submission.csv", row.names = FALSE)
Kaggle Username: Johnny Rodriguez
Score: 0.55520
# Load libraries
library(png)
library(grid)
# PNG image
img_url <- "https://github.com/johnnydrodriguez/data605/raw/aba02206339c8f92254d2d30e940ed0b1ebe3a3c/kaggle.png"
# Temporary file path
temp_file <- tempfile(fileext = ".png")
# Download image
download.file(img_url, temp_file, mode = "wb")
# Read image
img <- readPNG(temp_file)
# Display image
grid.raster(img, width = unit(1, "npc"), height = unit(.8, "npc"))
From the training dataset, I selected the following x & y variables. TotalBsmtSF shows right skewness.
## Id TotalBsmtSF GrLivArea
## 1455 1455 1221 1221
## 1456 1456 953 1647
## 1457 1457 1542 2073
## 1458 1458 1152 2340
## 1459 1459 1078 1078
## 1460 1460 1256 1256
# Plot the skewness of TotalBsmtSF
p1 <- ggplot(data, aes(x = TotalBsmtSF)) +
geom_histogram(binwidth = 50, fill = "blue", color = "black") +
ggtitle("Distribution of TotalBsmtSF") +
xlab("Total Basement SF") +
ylab("Frequency") +
theme_minimal()
# Plot the skewness of GrLivArea
p2 <- ggplot(data, aes(x = GrLivArea)) +
geom_histogram(binwidth = 100, fill = "green", color = "black") +
ggtitle("Distribution of GrLivArea") +
xlab("GrLivArea") +
ylab("Frequency") +
theme_minimal()
# Plot
grid.arrange(p1, p2, ncol = 2)
# Define the x and y variables
x_variable <- "TotalBsmtSF"
y_variable <- "GrLivArea"
# Calculate the 3rd quartile for TotalBsmtSF and the 2nd quartile for GrLivArea
third_quartile_x <- quantile(data[[x_variable]], 0.75)
second_quartile_y <- quantile(data[[y_variable]], 0.50)
# Categorize the x and y variables
data <- data %>%
mutate(
x_category = ifelse(TotalBsmtSF > third_quartile_x, ">3rd quartile", "<=3rd quartile"),
y_category = ifelse(GrLivArea > second_quartile_y, ">2nd quartile", "<=2nd quartile")
)
# Create a contingency table
contingency_table <- table(data$x_category, data$y_category)
contingency_table <- addmargins(contingency_table)
# Print contingency table
print(contingency_table)
##
## <=2nd quartile >2nd quartile Sum
## <=3rd quartile 659 436 1095
## >3rd quartile 72 293 365
## Sum 731 729 1460
This is th probability that the total basement area is greater than the 3rd quartile given that the living area above ground is greater than the median. This probability helps understand the likelihood of having an unusually large basement area among houses with a larger than average above-ground living area.
We calculate the probability using:
\[ P(X > x \mid Y > y) = \frac{P(X > x \cap Y > y)}{P(Y > y)} \]
P(X>x∣Y>y) = 0.4019204
There is approximately a 40.19% chance that a property has a total basement area greater than the 3rd quartile, given that the above-ground living area is greater than the median. This suggests a moderate likelihood that properties with larger living areas also have larger basements.
This is the probability that both the total basement area is greater than the 3rd quartile and the living area above ground is greater than the median. This joint probability yields the proportion of houses that have both a large basement area and a large above-ground living area.
We calculate the probability using:
\[ P(X > x, Y > y) = \frac{\text{Count}(X > x \cap Y > y)}{\text{Total Count}} \]
P(X>x,Y>y) = 0.2006849
There is approximately a 20.07% chance that a property has both a total basement area greater than the 3rd quartile and an above-ground living area greater than the median. This joint probability indicates the proportion of properties in the dataset that are both larger in terms of basement and living area. It shows that about 1 in 5 properties meet both criteria.
This is the probability that the total basement area is less than the 3rd quartile given that the living area above ground is greater than the median. This probability yields the proportion of houses having a smaller basement area among houses with a larger than average above-ground living area.
We calculate the probability using:
\[ P(X < x \mid Y > y) = \frac{P(X < x \cap Y > y)}{P(Y > y)} \]
P(X<x∣Y>y) = 0.5980796
There is approximately a 59.81% chance that a property has a total basement area less than or equal to the 3rd quartile, given that the above-ground living area is greater than the median. This suggests that many properties with larger living areas do not have large basements. This could indicate that for most larger homes, the basement area is not necessarily proportionally large.
# Extract necessary counts
count_X_greater_x_Y_greater_y <- contingency_table[">3rd quartile", ">2nd quartile"]
count_Y_greater_y <- contingency_table["Sum", ">2nd quartile"]
total_count <- contingency_table["Sum", "Sum"]
count_X_less_x_Y_greater_y <- contingency_table["<=3rd quartile", ">2nd quartile"]
# Calculate probabilities
P_X_greater_x_given_Y_greater_y <- count_X_greater_x_Y_greater_y / count_Y_greater_y
P_X_greater_x_and_Y_greater_y <- count_X_greater_x_Y_greater_y / total_count
P_X_less_x_given_Y_greater_y <- count_X_less_x_Y_greater_y / count_Y_greater_y
# Print probabilities
cat("P(X > x | Y > y) =", P_X_greater_x_given_Y_greater_y, "\n")
## P(X > x | Y > y) = 0.4019204
## P(X > x, Y > y) = 0.2006849
## P(X < x | Y > y) = 0.5980796
We are interested in determining whether \(P(A \mid B) = P(A)P(B)\). If these probabilities are equal, it implies the variables are independent. If the probabilities are not equal, it implies the size of of the total living area is not independent of the basement size.
Let:
\[ P(A) = \frac{\text{Number of observations where } X > x}{\text{Total number of observations}} \]
\[ P(B) = \frac{\text{Number of observations where } Y > y}{\text{Total number of observations}} \]
\[ P(A \mid B) = \frac{\text{Number of observations where } X > x \text{ and } Y > y}{\text{Number of observations where } Y > y} \]
\(P(A \mid B) = 0.4019204\): There is approximately a 40.19% chance that a property has a total basement area greater than the 3rd quartile, given that the above-ground living area is greater than the median.
\(P(A) = 0.25\): The probability that a property has a total basement area greater than the 3rd quartile.
\(P(B) = 0.5\): The probability that a property has an above-ground living area greater than the median.
\(P(A)P(B) = 0.125\): The expected probability of a property having both a large basement area (greater than the 3rd quartile) and a large above-ground living area (greater than the median) if the two features were independent of each other.
Therefore \(P(A \mid B) \neq P(A)P(B)\)
We can conclude that events \(A\) and \(B\) are not independent. This suggests that there is a relationship between having a larger basement area and having a larger above-ground living area.
# Calculate the 3rd quartile for TotalBsmtSF and the 2nd quartile for GrLivArea
third_quartile_x <- quantile(data[[x_variable]], 0.75)
second_quartile_y <- quantile(data[[y_variable]], 0.50)
#Categorize the x and y variables
data <- data %>%
mutate(
x_above_3rd_quartile = TotalBsmtSF > third_quartile_x,
y_above_2nd_quartile = GrLivArea > second_quartile_y
)
# Calculate counts
count_X_above_3rd_quartile <- sum(data$x_above_3rd_quartile)
count_Y_above_2nd_quartile <- sum(data$y_above_2nd_quartile)
count_X_and_Y_above <- sum(data$x_above_3rd_quartile & data$y_above_2nd_quartile)
total_count <- nrow(data)
# Calculate probabilities
P_A <- count_X_above_3rd_quartile / total_count
P_B <- count_Y_above_2nd_quartile / total_count
P_A_given_B <- count_X_and_Y_above / count_Y_above_2nd_quartile
# Compare P(A|B) with P(A)P(B)
P_A_times_P_B <- P_A * P_B
# Print results
cat("P(A) =", P_A, "\n")
## P(A) = 0.25
## P(B) = 0.4993151
## P(A|B) = 0.4019204
## P(A)P(B) = 0.1248288
Chi-Squared Statistic: 177.61 A large value, as in this, indicates a greater deviation from the expected counts, suggesting that the observed counts are not what we would expect if the two variables were independent.
Degrees of Freedom (df): 1 For a 2x2 contingency table, the degrees of freedom are calculated as (rows−1)×(columns−1). In this case, with two categories for each variable (above/below the quartiles), the degrees of freedom are 1.
p-value < 2.2e-16 The very small p-value (much less than 0.05) suggests strong evidence against the null hypothesis - which is that these variables are independent.
Conclusion Since the p-value is significantly less than 0.05, we reject the null hypothesis of independence. This indicates that there is a significant association between the total basement area (TotalBsmtSF) and the above-ground living area (GrLivArea). The result suggests that having a larger basement area is not independent of having a larger above-ground living area; there is a relationship between these two variables.
# Perform the Chi-Squared test
chi_squared_test <- chisq.test(contingency_table)
# Print the results
print(chi_squared_test)
##
## Pearson's Chi-squared test
##
## data: contingency_table
## X-squared = 179.22, df = 4, p-value < 2.2e-16
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 795.8 991.5 1057.4 1298.2 6110.0
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1130 1464 1515 1777 5642
# Create histograms
hist_totalbsmt <- ggplot(data, aes(x = TotalBsmtSF)) +
geom_histogram(binwidth = 50, fill = "blue", color = "black") +
ggtitle("Distribution of TotalBsmtSF")
hist_grlivarea <- ggplot(data, aes(x = GrLivArea)) +
geom_histogram(binwidth = 50, fill = "green", color = "black") +
ggtitle("Distribution of GrLivArea")
# Arrange the plots side by side
grid.arrange(hist_totalbsmt, hist_grlivarea, ncol = 2)
Positive Correlation: The plot shows a generally positive trend, indicating that as the total basement area increases, the above grade living area also tends to increase. This suggests a positive correlation between the two variables.
Concentration of Data Points: There is a high concentration of data points in the lower ranges of both TotalBsmtSF and GrLivArea. Most houses have a TotalBsmtSF and GrLivArea below 2000 square feet. This clustering suggests that the majority of houses in the dataset have similar basement and living area sizes.
Outliers: There are a few noticeable outliers, particularly one house with a very large basement area (around 6000 square feet) and another with a large above grade living area (over 5000 square feet). These outliers can significantly influence the correlation and mean statistics.
Linear Relationship: The linear pattern in the scatterplot suggests that a linear model might be appropriate to describe the relationship between TotalBsmtSF and GrLivArea.
ggplot(data, aes(x = TotalBsmtSF, y = GrLivArea)) +
geom_point() +
ggtitle("Scatterplot of TotalBsmtSF vs GrLivArea") +
xlab("Total Basement SF") +
ylab("Above Grade Living Area SF")
Negative Difference in Means: The confidence interval for the difference in means is entirely negative, indicating that the mean value of GrLivArea is greater than the mean value of TotalBsmtSF. The mean of GrLivArea exceeds the mean of TotalBsmtSF by a value between approximately 422.91 and 493.16 square feet.
Statistical Significance: Since the confidence interval does not include 0, it suggests that there is a statistically significant difference between the mean values of GrLivArea and TotalBsmtSF at the 95% confidence level. The difference is not likely due to random chance.
Conclusion The above grade living area (GrLivArea) is consistently larger than the basement area (TotalBsmtSF) across the dataset. This is intuitive as houses generally have more living space above ground than in the basement.
## [1] -493.1624 -422.9061
## attr(,"conf.level")
## [1] 0.95
Correlation Coefficient for TotalBsmtSF and GrLivArea: 0.4548682
The correlation coefficient between TotalBsmtSF and GrLivArea is approximately 0.4549. This indicates a moderate positive correlation between the two variables. As the total basement area increases, the above grade living area tends to increase as well – but not perfectly and likely not in tandem.
The relationship between TotalBsmtSF and GrLivArea is positive; larger basements are associated with larger above grade living areas. But a correlation coefficient of only 0.4549 suggests a moderate relationship. While there is a positive association, other factors might also influence the GrLivArea besides the TotalBsmtSF.
# Select columns
data_selected <- data[, c("TotalBsmtSF", "GrLivArea")]
# Calculate correlation matrix
cor_matrix <- cor(data_selected)
# Print correlation matrix
print(cor_matrix)
## TotalBsmtSF GrLivArea
## TotalBsmtSF 1.0000000 0.4548682
## GrLivArea 0.4548682 1.0000000
P-value: 1.85787 ×10e^−75
The extremely small p-value indicates that the null hypothesis - stating that there is no correlation between TotalBsmtSF and GrLivArea – can be rejected. There is a statistically significant correlation between the two variables.
99% Confidence Interval: [0.3997401, 0.5067175]
This interval does not include 0, further confirming that the correlation is statistically significant.
The confidence interval is relatively narrow and entirely positive, indicating that we can be 99% confident that the true correlation coefficient between TotalBsmtSF and GrLivArea lies between 0.3997 and 0.5067. This reinforces the conclusion that there is a moderate positive correlation between the two variables.
The correlation coefficient falls within the moderate range, indicating a moderate positive relationship between TotalBsmtSF and GrLivArea. In general, houses with larger basement areas tend to have larger above grade living areas.
Confidence Level: 99% & Summary
The hypothesis test results indicate a significant correlation between TotalBsmtSF and GrLivArea, with a p-value close to zero. The 99% confidence interval for the correlation coefficient[0.3997,0.5067] confirms that there is a moderate positive correlation between the two variables. We can conclude that as the total basement area increases, the above grade living area also tends to increase; this relationship is statistically significant and unlikely to be due to random variation.
# Test the correlation between TotalBsmtSF and GrLivArea
cor_test_result <- cor.test(data$TotalBsmtSF, data$GrLivArea, conf.level = 0.99)
# Extract the p-value
p_value <- cor_test_result$p.value
# Extract 99% confidence interval
conf_interval_99 <- cor_test_result$conf.int
# Display results
p_value
## [1] 1.85787e-75
## [1] 0.3997401 0.5067175
## attr(,"conf.level")
## [1] 0.99
\[ \begin{bmatrix} 1.0000000 & 0.4548682 \\ 0.4548682 & 1.0000000 \\ \end{bmatrix} \]
\[ \begin{bmatrix} 1.2608831 & -0.5735356 \\ -0.5735356 & 1.2608831 \\ \end{bmatrix} \]
\[ \begin{bmatrix} 1 & -2.620121 \times 10^{-17} \\ 0 & 1.000000 \times 10^{0} \\ \end{bmatrix} \]
\[ \begin{bmatrix} 1.000000 \times 10^{0} & 0 \\ -2.620121 \times 10^{-17} & 1 \\ \end{bmatrix} \]
## TotalBsmtSF GrLivArea
## TotalBsmtSF 1.0000000 0.4548682
## GrLivArea 0.4548682 1.0000000
# Invert the correlation matrix to get the precision matrix
precision_matrix <- solve(cor_matrix)
precision_matrix
## TotalBsmtSF GrLivArea
## TotalBsmtSF 1.2608831 -0.5735356
## GrLivArea -0.5735356 1.2608831
# Multiply correlation matrix by precision matrix
cor_times_precision <- cor_matrix %*% precision_matrix
cor_times_precision
## TotalBsmtSF GrLivArea
## TotalBsmtSF 1 -2.620121e-17
## GrLivArea 0 1.000000e+00
# Multiply precision matrix by correlation matrix
precision_times_cor <- precision_matrix %*% cor_matrix
precision_times_cor
## TotalBsmtSF GrLivArea
## TotalBsmtSF 1.000000e+00 0
## GrLivArea -2.620121e-17 1
PC1 captures 75% of the variance, indicating that most of the variability in TotalBsmtSF and GrLivArea can be summarized by this single component. This suggests that PC1 is a strong indicator of the overall size of the houses in terms of both basement and above-grade living areas.
PC2 captures the remaining 25% of the variance, indicating additional variability that is not captured by PC1. PC2 accounts for differences between the variables that are not explained by the overall size, such as variations where some houses may have larger basements relative to their living areas or vice versa.
Cumulative Variance:
# Perform PCA using prcomp
pca_result <- prcomp(data %>% select(TotalBsmtSF, GrLivArea), scale. = TRUE)
# Summary stats for PCA scores
pca_summary <- summary(pca_result)
# Proportion of variance explained by each component
explained_variance <- pca_summary$importance[2,]
cumulative_variance <- pca_summary$importance[3,]
# Print the explained variance
cat("Proportion of Variance Explained by Each Component:\n")
## Proportion of Variance Explained by Each Component:
## PC1 PC2
## 0.72743 0.27257
##
## Cumulative Variance Explained by Each Component:
## PC1 PC2
## 0.72743 1.00000
Loadings:
TotalBsmtSF: The arrow for TotalBsmtSF points primarily along the direction of PC1, indicating that TotalBsmtSF has a strong contribution to PC1. The length and direction of the arrow suggest that houses with larger basement areas tend to have higher PC1 scores.
GrLivArea: The arrow for GrLivArea also points along the direction of PC1 but at a slightly different angle, suggesting that GrLivArea has a strong contribution to PC1, similar to TotalBsmtSF. The length and direction of the arrow indicate that houses with larger above-grade living areas also tend to have higher PC1 scores.
Angle Between Arrows:
Conclusion - PC1 captures the majority of the variance in the dataset and is primarily influenced by both TotalBsmtSF and GrLivArea. Houses with high PC1 scores are those with both large basement areas and large above-grade living areas. PC1 may be interpreted as an overall size factor of the house.
PC2 is residual variance. PC2 might capture differences where the relative sizes of the basement and living areas vary - such as houses with relatively larger basements but smaller living areas and vice versa).
PC1 is the most significant principal component, capturing the largest proportion of the variance in the dataset. It represents the combined effect of TotalBsmtSF and GrLivArea, making it a good indicator of the overall size of the house. The positive correlation between TotalBsmtSF and GrLivArea is evident from the small angle between their arrows. This means that larger basements are typically associated with larger above-grade living areas.
# PCA
pca_result <- principal(data %>% select(TotalBsmtSF, GrLivArea), nfactors = 2, rotate = "none")
# Biplot
plot(pca_result$scores[,1], pca_result$scores[,2],
xlab = "PC1", ylab = "PC2",
main = "PCA Biplot",
pch = 20, col = "blue")
arrows(0, 0, pca_result$loadings[,1] * max(pca_result$scores[,1]),
pca_result$loadings[,2] * max(pca_result$scores[,2]),
col = "red", lwd = 1, length = 0.1)
text(pca_result$loadings[,1] * max(pca_result$scores[,1]),
pca_result$loadings[,2] * max(pca_result$scores[,2]),
labels = rownames(pca_result$loadings),
col = "red", pos = 3, cex = 1.2)
The scree plot shows that:
Variance Explained by PC1:
Almost 1.5 Units of Variance represents the proportion of total variance in the TotalBsmtSF and GrLivArea data that is captured by PC1. Since we are using mean = 0, variance = 1 for each variable, the total variance to be explained by all principal components combined is equal to the number of variables, which is 2 in this case (TotalBsmtSF and GrLivArea).
PC1 captures 1.5 out of the total 2 units of variance, which means PC1 explains 75% of the total variance in the data.
Variance Explained by PC2:
About 0.5 Units of Variance represents the additional variance in the dataset captured by PC2. PC2 captures 0.5 out of the total 2 units of variance, which mexplains 25% of the total variance in the data.
Conclusion*
PC1 is the dominant component, explaining 75% of the total variance in TotalBsmtSF and GrLivArea. This suggests that most of the variability in the data is captured by a single underlying factor that combines both TotalBsmtSF and GrLivArea. We can interpret this as houses with larger TotalBsmtSF are likely to also have larger GrLivArea, and this combined size dimension is what PC1 primarily represents.
PC2 explains the remaining 25% of the total variance and represents residual variance: PC2 captures the additional variability in TotalBsmtSF and GrLivArea that is not explained by PC1. In practical term. PC2 might capture differences where the basement size and the above-grade living area size do not move together.
# Scree plot
scree_data <- data.frame(
Component = 1:length(pca_result$values),
Variance = pca_result$values
)
ggplot(scree_data, aes(x = Component, y = Variance)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_line(aes(y = Variance), color = "red") +
geom_point(aes(y = Variance), color = "red") +
ggtitle("Scree Plot") +
xlab("Principal Components") +
ylab("Variance Explained")
library(MASS)
# Shift TotalBsmtSF to minimum value above zero
data <- data %>% mutate(ShiftedTotalBsmtSF = TotalBsmtSF - min(TotalBsmtSF) + 1)
# Fit exponential distribution
fit <- fitdistr(data$ShiftedTotalBsmtSF, "exponential")
# Extract the optimal value of lambda
lambda <- fit$estimate
# Set seed
set.seed(123)
# Take 1000 samples from the exponential distribution
samples <- rexp(1000, rate = lambda)
# Set up plotting area for side by side plots
par(mfrow = c(1, 2))
# Plot histogram of shifted
hist(data$ShiftedTotalBsmtSF, breaks = 30, probability = TRUE, main = "Histogram of Shifted TotalBsmtSF",
xlab = "Shifted TotalBsmtSF", col = "blue", border = "black")
# Add density curve
lines(density(data$ShiftedTotalBsmtSF), col = "red")
# Plot histogram of exponential distribution
hist(samples, breaks = 30, probability = TRUE, main = "Histogram of Exponential Samples",
xlab = "Exponential Samples", col = "blue", border = "black")
# Add density curve
lines(density(samples), col = "red")
Confidence Interval: [1035.926, 1080.933]
This confidence interval indicates that we are 95% confident that the true mean of the shifted TotalBsmtSF falls between approximately 1036 and 1081. This interval assumes the data follows a normal distribution. For TotalBsmtSF, which represents the total basement area of houses, this interval provides a range for the average basement size in the dataset, adjusted to ensure all values are positive.
# Calculate the 95% Confidence
# Shift TotalBsmtSF to minimum value above zero
data <- data %>% mutate(ShiftedTotalBsmtSF = TotalBsmtSF - min(TotalBsmtSF) + 1)
# Calculate mean and standard deviation of the shifted TotalBsmtSF
mean_shifted <- mean(data$ShiftedTotalBsmtSF)
sd_shifted <- sd(data$ShiftedTotalBsmtSF)
# Calculate the 95% confidence
ci_lower <- mean_shifted - qnorm(0.975) * sd_shifted / sqrt(length(data$ShiftedTotalBsmtSF))
ci_upper <- mean_shifted + qnorm(0.975) * sd_shifted / sqrt(length(data$ShiftedTotalBsmtSF))
ci_lower
## [1] 1035.926
## [1] 1080.933
The exponential distribution can be used to model skewed data. For TotalBsmtSF, it may fit well if the data is heavily right-skewed, with many small values and few large ones. Comparison against the empirical results will indicate if the exponential CDF fits the TotalBsmtSF data.
# Calculate the 5th and 95th percentiles using the exponential CDF
p5_exp <- qexp(0.05, rate = lambda)
p95_exp <- qexp(0.95, rate = lambda)
p5_exp
## [1] 54.29033
## [1] 3170.771
These percentiles are based directly on the observed data and do not assume any specific distribution. In comparison to the exponential CDF, there is a large difference in the percentile values.
# Calculate the empirical 5th and 95th percentiles of the shifted TotalBsmtSF
p5_empirical <- quantile(data$ShiftedTotalBsmtSF, 0.05)
p95_empirical <- quantile(data$ShiftedTotalBsmtSF, 0.95)
p5_empirical
## 5%
## 520.3
## 95%
## 1754
Conclusion
The Exponential Distribution can be used to model data that is heavily right-skewed, where the data has a long tail to the right. The significant difference between the exponential and empirical percentiles suggests that while the data is right-skewed (and possibly only slighlty), the exponential distribution may overestimate the tail behavior.
The Empirical Percentiles are calculated directly from the data, providing a more accurate representation of the data’s distribution without assuming any specific theoretical model.
The large gap between the exponential and empirical percentiles implies that the exponential distribution does not fit the data well. A good fit would result in closer percentile values between the exponential and empirical distributions. If the data were perfectly modeled by an exponential distribution, the percentiles would be closer to the empirical values. The large difference suggests that the exponential model may not capture the nuances of the data.
The empirical percentiles are based on the actual data distribution, indicating that the approach may be more appropriate for summarizing and understanding the data. Given the discrepancy, relying on the exponential distribution may lead to misleading conclusions about the data. The exponential model assumes a specific type of skewness and tail behavior that may not be present the the TotalBsmtSF data.