library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(MASS)
library(stats)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
library(e1071)

data <- read.csv("/Users/mrobinson/Downloads/train.csv")



skewness(data$LotArea)
## [1] 12.18262
skewness(data$SalePrice)
## [1] 1.879009

Quartiles

if (!("LotArea" %in% names(data)) || !("SalePrice" %in% names(data))) {
  stop("Data must contain 'LotArea' and 'SalePrice' columns")
}

x_3rd_quartile <- quantile(data$LotArea, 0.75)
y_2nd_quartile <- quantile(data$SalePrice, 0.50)

data$X_gt_3rdQ <- data$LotArea > x_3rd_quartile
data$Y_gt_2ndQ <- data$SalePrice > y_2nd_quartile

p_XgtX_given_YgtY <- sum(data$X_gt_3rdQ & data$Y_gt_2ndQ) / sum(data$Y_gt_2ndQ)
p_XgtX_and_YgtY <- sum(data$X_gt_3rdQ & data$Y_gt_2ndQ) / nrow(data)
p_XltX_given_YgtY <- sum(!data$X_gt_3rdQ & data$Y_gt_2ndQ) / sum(data$Y_gt_2ndQ)


table_of_counts <- with(data, table(Xy= data$X_gt_3rdQ, Y_gt_2ndQ = data$Y_gt_2ndQ))
colnames(table_of_counts) <- c("Y <= 2nd quartile", "Y > 2nd quartile")
rownames(table_of_counts) <- c("X <= 3rd quartile", "X > 3rd quartile")

table_of_counts <- addmargins(table_of_counts)

print(paste("P(X > x | Y > y):", p_XgtX_given_YgtY))
## [1] "P(X > x | Y > y): 0.379120879120879"
print(paste("P(X > x, Y > y):", p_XgtX_and_YgtY))
## [1] "P(X > x, Y > y): 0.189041095890411"
print(paste("P(X < x | Y > y):", p_XltX_given_YgtY))
## [1] "P(X < x | Y > y): 0.620879120879121"
print(table_of_counts)
##                    Y_gt_2ndQ
## Xy                  Y <= 2nd quartile Y > 2nd quartile  Sum
##   X <= 3rd quartile               643              452 1095
##   X > 3rd quartile                 89              276  365
##   Sum                             732              728 1460

Probability.

data$A <- data$LotArea > quantile(data$LotArea, 0.75)
data$B <- data$SalePrice > quantile(data$SalePrice, 0.50)

p_A <- mean(data$A)
p_B <- mean(data$B)
p_A_and_B <- mean(data$A & data$B)
p_A_given_B <- p_A_and_B / p_B  # Conditional probability P(A|B)

independence_check <- p_A_given_B == p_A * p_B

table_of_counts <- table(data$A, data$B)
chi_square_results <- chisq.test(table_of_counts)

print(paste("P(A):", p_A))
## [1] "P(A): 0.25"
print(paste("P(B):", p_B))
## [1] "P(B): 0.498630136986301"
print(paste("P(A and B):", p_A_and_B))
## [1] "P(A and B): 0.189041095890411"
print(paste("P(A|B):", p_A_given_B))
## [1] "P(A|B): 0.379120879120879"
print(paste("Are A and B independent based on product rule?:", independence_check))
## [1] "Are A and B independent based on product rule?: FALSE"
print(chi_square_results)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table_of_counts
## X-squared = 127.74, df = 1, p-value < 2.2e-16

No, splitting the training data based on these quartiles does not result in independent subsets. The conditional probability P(A∣B) was significantly different from the product P(A)P(B), and the Chi-Square test for independence returned a very small p-value (less than 2.2e-16)

Descriptive and Inferential Statistics.

summary(data$LotArea)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1300    7554    9478   10517   11602  215245
summary(data$SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000
# Histograms
hist(data$LotArea, main="Histogram of Lot Area", xlab="Lot Area (sq ft)", col="blue", breaks=50)

hist(data$SalePrice, main="Histogram of Sale Price", xlab="Sale Price ($)", col="red", breaks=50)

plot(data$LotArea, data$SalePrice, main="Scatterplot of Lot Area vs. Sale Price",
     xlab="Lot Area (sq ft)", ylab="Sale Price ($)", pch=19, col=rgb(0, 0, 1, 0.5))

t_test <- t.test(data$LotArea, data$SalePrice, conf.level=0.95)
print(t_test)
## 
##  Welch Two Sample t-test
## 
## data:  data$LotArea and data$SalePrice
## t = -81.321, df = 1505.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -174514.7 -166294.1
## sample estimates:
## mean of x mean of y 
##  10516.83 180921.20
cor_matrix <- cor(data[,c("LotArea", "GrLivArea")])
print(cor_matrix)
##             LotArea GrLivArea
## LotArea   1.0000000 0.2631162
## GrLivArea 0.2631162 1.0000000
cor_test <- cor.test(data$LotArea, data$GrLivArea, conf.level=0.99)
print(cor_test)
## 
##  Pearson's product-moment correlation
## 
## data:  data$LotArea and data$GrLivArea
## t = 10.414, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
##  0.1992693 0.3247386
## sample estimates:
##       cor 
## 0.2631162

Linear Algebra and Correlation.

cor_matrix <- cor(data[,c("LotArea", "GrLivArea")])

precision_matrix <- solve(cor_matrix)

identity_matrix_from_cor <- cor_matrix %*% precision_matrix
identity_matrix_from_precision <- precision_matrix %*% cor_matrix

print(identity_matrix_from_cor)
##           LotArea     GrLivArea
## LotArea         1 -2.243337e-17
## GrLivArea       0  1.000000e+00
print(identity_matrix_from_precision)
##                 LotArea GrLivArea
## LotArea    1.000000e+00         0
## GrLivArea -2.243337e-17         1
pca_results <- prcomp(data[,c("LotArea", "GrLivArea")], scale. = TRUE)

summary(pca_results)
## Importance of components:
##                           PC1    PC2
## Standard deviation     1.1239 0.8584
## Proportion of Variance 0.6316 0.3684
## Cumulative Proportion  0.6316 1.0000
screeplot(pca_results, type="lines")

biplot(pca_results)

Calculus-Based Probability & Statistics.

adjusted_LotArea <- data$LotArea - min(data$LotArea) + 1

fit <- fitdistr(adjusted_LotArea, densfun = "exponential")

print(fit)
##        rate    
##   1.084854e-04 
##  (2.839193e-06)
lambda <- 1 / fit$estimate  # Rate parameter of the exponential distribution
samples <- rexp(1000, rate = lambda)

hist(adjusted_LotArea, main="Histogram of Adjusted Lot Area", xlab="Adjusted Lot Area", col=rgb(0.2, 0.4, 0.6, 0.5), breaks=50)
hist(samples, main="Histogram of Simulated Data", xlab="Simulated Lot Area", col=rgb(0.6, 0.4, 0.2, 0.5), breaks=50, add=T)
legend("topright", legend=c("Adjusted Data", "Simulated Data"), fill=c(rgb(0.2, 0.4, 0.6, 0.5), rgb(0.6, 0.4, 0.2, 0.5)))

empirical_percentiles <- quantile(adjusted_LotArea, probs = c(0.05, 0.95))

exponential_percentiles <- qexp(c(0.05, 0.95), rate = lambda)

print(paste("Empirical 5th and 95th percentiles:", empirical_percentiles))
## [1] "Empirical 5th and 95th percentiles: 2012.7"  
## [2] "Empirical 5th and 95th percentiles: 16102.15"
print(paste("Exponential distribution 5th and 95th percentiles:", exponential_percentiles))
## [1] "Exponential distribution 5th and 95th percentiles: 5.5645748575682e-06"
## [2] "Exponential distribution 5th and 95th percentiles: 0.00032499328983381"
ci <- t.test(adjusted_LotArea, conf.level = 0.95)$conf.int

print(paste("95% Confidence Interval of the mean:", ci))
## [1] "95% Confidence Interval of the mean: 8705.41798983372"
## [2] "95% Confidence Interval of the mean: 9730.23817454985"

The histograms comparing the adjusted LotArea and the samples from the fitted exponential model visually demonstrates the approximate accuracy of the exponential fit to the actual data distribution, showing any discrepancies in their shapes. The percentiles provide an understanding of the distribution’s concentration, showing how the model aligns with empirical data points. The confidence interval estimates the true mean’s location of the adjusted LotArea, helping in understanding its central tendency with an assumption of normality. Finally, the suitability of the exponential fit should be discussed in relation to the empirical data and the attributes of skewed variables, including the potential limitations of taking on an exponential distribution if notable deviations are observed in the histogram comparisons.

Build a Regression Model and Generate Predictions

rf_model <- randomForest(SalePrice ~ LotArea + GrLivArea + OverallQual, data=data, ntree=500, importance=TRUE)


print(rf_model)
## 
## Call:
##  randomForest(formula = SalePrice ~ LotArea + GrLivArea + OverallQual,      data = data, ntree = 500, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 1361559334
##                     % Var explained: 78.41
importance(rf_model)
##              %IncMSE IncNodePurity
## LotArea     24.53604  1.604376e+12
## GrLivArea   38.96076  2.885569e+12
## OverallQual 76.65369  3.710160e+12
test_data <- read.csv("~/Downloads/test.csv")  # Ensure this dataset is preprocessed similarly
predictions <- predict(rf_model, newdata=test_data)


submission <- data.frame(Id=test_data$Id, SalePrice=predictions)
write.csv(submission, "~/Downloads/submission2.csv", row.names=FALSE)

kaggle_username <- "michael112db"
kaggle_score <- "0.18951"


print(paste("Kaggle Username:", kaggle_username))
## [1] "Kaggle Username: michael112db"
print(paste("Kaggle Score:", kaggle_score))
## [1] "Kaggle Score: 0.18951"