library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(MASS)
library(stats)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
library(e1071)
data <- read.csv("/Users/mrobinson/Downloads/train.csv")
skewness(data$LotArea)
## [1] 12.18262
skewness(data$SalePrice)
## [1] 1.879009
if (!("LotArea" %in% names(data)) || !("SalePrice" %in% names(data))) {
stop("Data must contain 'LotArea' and 'SalePrice' columns")
}
x_3rd_quartile <- quantile(data$LotArea, 0.75)
y_2nd_quartile <- quantile(data$SalePrice, 0.50)
data$X_gt_3rdQ <- data$LotArea > x_3rd_quartile
data$Y_gt_2ndQ <- data$SalePrice > y_2nd_quartile
p_XgtX_given_YgtY <- sum(data$X_gt_3rdQ & data$Y_gt_2ndQ) / sum(data$Y_gt_2ndQ)
p_XgtX_and_YgtY <- sum(data$X_gt_3rdQ & data$Y_gt_2ndQ) / nrow(data)
p_XltX_given_YgtY <- sum(!data$X_gt_3rdQ & data$Y_gt_2ndQ) / sum(data$Y_gt_2ndQ)
table_of_counts <- with(data, table(Xy= data$X_gt_3rdQ, Y_gt_2ndQ = data$Y_gt_2ndQ))
colnames(table_of_counts) <- c("Y <= 2nd quartile", "Y > 2nd quartile")
rownames(table_of_counts) <- c("X <= 3rd quartile", "X > 3rd quartile")
table_of_counts <- addmargins(table_of_counts)
print(paste("P(X > x | Y > y):", p_XgtX_given_YgtY))
## [1] "P(X > x | Y > y): 0.379120879120879"
print(paste("P(X > x, Y > y):", p_XgtX_and_YgtY))
## [1] "P(X > x, Y > y): 0.189041095890411"
print(paste("P(X < x | Y > y):", p_XltX_given_YgtY))
## [1] "P(X < x | Y > y): 0.620879120879121"
print(table_of_counts)
## Y_gt_2ndQ
## Xy Y <= 2nd quartile Y > 2nd quartile Sum
## X <= 3rd quartile 643 452 1095
## X > 3rd quartile 89 276 365
## Sum 732 728 1460
data$A <- data$LotArea > quantile(data$LotArea, 0.75)
data$B <- data$SalePrice > quantile(data$SalePrice, 0.50)
p_A <- mean(data$A)
p_B <- mean(data$B)
p_A_and_B <- mean(data$A & data$B)
p_A_given_B <- p_A_and_B / p_B # Conditional probability P(A|B)
independence_check <- p_A_given_B == p_A * p_B
table_of_counts <- table(data$A, data$B)
chi_square_results <- chisq.test(table_of_counts)
print(paste("P(A):", p_A))
## [1] "P(A): 0.25"
print(paste("P(B):", p_B))
## [1] "P(B): 0.498630136986301"
print(paste("P(A and B):", p_A_and_B))
## [1] "P(A and B): 0.189041095890411"
print(paste("P(A|B):", p_A_given_B))
## [1] "P(A|B): 0.379120879120879"
print(paste("Are A and B independent based on product rule?:", independence_check))
## [1] "Are A and B independent based on product rule?: FALSE"
print(chi_square_results)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table_of_counts
## X-squared = 127.74, df = 1, p-value < 2.2e-16
No, splitting the training data based on these quartiles does not result in independent subsets. The conditional probability P(A∣B) was significantly different from the product P(A)P(B), and the Chi-Square test for independence returned a very small p-value (less than 2.2e-16)
summary(data$LotArea)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1300 7554 9478 10517 11602 215245
summary(data$SalePrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
# Histograms
hist(data$LotArea, main="Histogram of Lot Area", xlab="Lot Area (sq ft)", col="blue", breaks=50)
hist(data$SalePrice, main="Histogram of Sale Price", xlab="Sale Price ($)", col="red", breaks=50)
plot(data$LotArea, data$SalePrice, main="Scatterplot of Lot Area vs. Sale Price",
xlab="Lot Area (sq ft)", ylab="Sale Price ($)", pch=19, col=rgb(0, 0, 1, 0.5))
t_test <- t.test(data$LotArea, data$SalePrice, conf.level=0.95)
print(t_test)
##
## Welch Two Sample t-test
##
## data: data$LotArea and data$SalePrice
## t = -81.321, df = 1505.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -174514.7 -166294.1
## sample estimates:
## mean of x mean of y
## 10516.83 180921.20
cor_matrix <- cor(data[,c("LotArea", "GrLivArea")])
print(cor_matrix)
## LotArea GrLivArea
## LotArea 1.0000000 0.2631162
## GrLivArea 0.2631162 1.0000000
cor_test <- cor.test(data$LotArea, data$GrLivArea, conf.level=0.99)
print(cor_test)
##
## Pearson's product-moment correlation
##
## data: data$LotArea and data$GrLivArea
## t = 10.414, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## 0.1992693 0.3247386
## sample estimates:
## cor
## 0.2631162
cor_matrix <- cor(data[,c("LotArea", "GrLivArea")])
precision_matrix <- solve(cor_matrix)
identity_matrix_from_cor <- cor_matrix %*% precision_matrix
identity_matrix_from_precision <- precision_matrix %*% cor_matrix
print(identity_matrix_from_cor)
## LotArea GrLivArea
## LotArea 1 -2.243337e-17
## GrLivArea 0 1.000000e+00
print(identity_matrix_from_precision)
## LotArea GrLivArea
## LotArea 1.000000e+00 0
## GrLivArea -2.243337e-17 1
pca_results <- prcomp(data[,c("LotArea", "GrLivArea")], scale. = TRUE)
summary(pca_results)
## Importance of components:
## PC1 PC2
## Standard deviation 1.1239 0.8584
## Proportion of Variance 0.6316 0.3684
## Cumulative Proportion 0.6316 1.0000
screeplot(pca_results, type="lines")
biplot(pca_results)
adjusted_LotArea <- data$LotArea - min(data$LotArea) + 1
fit <- fitdistr(adjusted_LotArea, densfun = "exponential")
print(fit)
## rate
## 1.084854e-04
## (2.839193e-06)
lambda <- 1 / fit$estimate # Rate parameter of the exponential distribution
samples <- rexp(1000, rate = lambda)
hist(adjusted_LotArea, main="Histogram of Adjusted Lot Area", xlab="Adjusted Lot Area", col=rgb(0.2, 0.4, 0.6, 0.5), breaks=50)
hist(samples, main="Histogram of Simulated Data", xlab="Simulated Lot Area", col=rgb(0.6, 0.4, 0.2, 0.5), breaks=50, add=T)
legend("topright", legend=c("Adjusted Data", "Simulated Data"), fill=c(rgb(0.2, 0.4, 0.6, 0.5), rgb(0.6, 0.4, 0.2, 0.5)))
empirical_percentiles <- quantile(adjusted_LotArea, probs = c(0.05, 0.95))
exponential_percentiles <- qexp(c(0.05, 0.95), rate = lambda)
print(paste("Empirical 5th and 95th percentiles:", empirical_percentiles))
## [1] "Empirical 5th and 95th percentiles: 2012.7"
## [2] "Empirical 5th and 95th percentiles: 16102.15"
print(paste("Exponential distribution 5th and 95th percentiles:", exponential_percentiles))
## [1] "Exponential distribution 5th and 95th percentiles: 5.5645748575682e-06"
## [2] "Exponential distribution 5th and 95th percentiles: 0.00032499328983381"
ci <- t.test(adjusted_LotArea, conf.level = 0.95)$conf.int
print(paste("95% Confidence Interval of the mean:", ci))
## [1] "95% Confidence Interval of the mean: 8705.41798983372"
## [2] "95% Confidence Interval of the mean: 9730.23817454985"
The histograms comparing the adjusted LotArea and the samples from the fitted exponential model visually demonstrates the approximate accuracy of the exponential fit to the actual data distribution, showing any discrepancies in their shapes. The percentiles provide an understanding of the distribution’s concentration, showing how the model aligns with empirical data points. The confidence interval estimates the true mean’s location of the adjusted LotArea, helping in understanding its central tendency with an assumption of normality. Finally, the suitability of the exponential fit should be discussed in relation to the empirical data and the attributes of skewed variables, including the potential limitations of taking on an exponential distribution if notable deviations are observed in the histogram comparisons.
rf_model <- randomForest(SalePrice ~ LotArea + GrLivArea + OverallQual, data=data, ntree=500, importance=TRUE)
print(rf_model)
##
## Call:
## randomForest(formula = SalePrice ~ LotArea + GrLivArea + OverallQual, data = data, ntree = 500, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 1361559334
## % Var explained: 78.41
importance(rf_model)
## %IncMSE IncNodePurity
## LotArea 24.53604 1.604376e+12
## GrLivArea 38.96076 2.885569e+12
## OverallQual 76.65369 3.710160e+12
test_data <- read.csv("~/Downloads/test.csv") # Ensure this dataset is preprocessed similarly
predictions <- predict(rf_model, newdata=test_data)
submission <- data.frame(Id=test_data$Id, SalePrice=predictions)
write.csv(submission, "~/Downloads/submission2.csv", row.names=FALSE)
kaggle_username <- "michael112db"
kaggle_score <- "0.18951"
print(paste("Kaggle Username:", kaggle_username))
## [1] "Kaggle Username: michael112db"
print(paste("Kaggle Score:", kaggle_score))
## [1] "Kaggle Score: 0.18951"