```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
```

Load necessary libraries

library(dplyr) 
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2) 
library(tidyr)

Load the dataset

# Read the Ames housing data; the first line of the CSV supplies the column names.
ames <- read.csv(file = "D:/Stats for DS/ames.csv", header = TRUE)

Basic overview of the data

# Print the compact structure of the data frame: each column's name, type,
# and first few values.
str(ames)
## 'data.frame':    2930 obs. of  82 variables:
##  $ Order          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ PID            : int  526301100 526350040 526351010 526353030 527105010 527105030 527127150 527145080 527146030 527162130 ...
##  $ MS.SubClass    : int  20 20 20 20 60 60 120 120 120 60 ...
##  $ MS.Zoning      : chr  "RL" "RH" "RL" "RL" ...
##  $ Lot.Frontage   : int  141 80 81 93 74 78 41 43 39 60 ...
##  $ Lot.Area       : int  31770 11622 14267 11160 13830 9978 4920 5005 5389 7500 ...
##  $ Street         : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley          : chr  NA NA NA NA ...
##  $ Lot.Shape      : chr  "IR1" "Reg" "IR1" "Reg" ...
##  $ Land.Contour   : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities      : chr  "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ Lot.Config     : chr  "Corner" "Inside" "Corner" "Corner" ...
##  $ Land.Slope     : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood   : chr  "NAmes" "NAmes" "NAmes" "NAmes" ...
##  $ Condition.1    : chr  "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition.2    : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ Bldg.Type      : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ House.Style    : chr  "1Story" "1Story" "1Story" "1Story" ...
##  $ Overall.Qual   : int  6 5 6 7 5 6 8 8 8 7 ...
##  $ Overall.Cond   : int  5 6 6 5 5 6 5 5 5 5 ...
##  $ Year.Built     : int  1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
##  $ Year.Remod.Add : int  1960 1961 1958 1968 1998 1998 2001 1992 1996 1999 ...
##  $ Roof.Style     : chr  "Hip" "Gable" "Hip" "Hip" ...
##  $ Roof.Matl      : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior.1st   : chr  "BrkFace" "VinylSd" "Wd Sdng" "BrkFace" ...
##  $ Exterior.2nd   : chr  "Plywood" "VinylSd" "Wd Sdng" "BrkFace" ...
##  $ Mas.Vnr.Type   : chr  "Stone" "None" "BrkFace" "None" ...
##  $ Mas.Vnr.Area   : int  112 0 108 0 0 20 0 0 0 0 ...
##  $ Exter.Qual     : chr  "TA" "TA" "TA" "Gd" ...
##  $ Exter.Cond     : chr  "TA" "TA" "TA" "TA" ...
##  $ Foundation     : chr  "CBlock" "CBlock" "CBlock" "CBlock" ...
##  $ Bsmt.Qual      : chr  "TA" "TA" "TA" "TA" ...
##  $ Bsmt.Cond      : chr  "Gd" "TA" "TA" "TA" ...
##  $ Bsmt.Exposure  : chr  "Gd" "No" "No" "No" ...
##  $ BsmtFin.Type.1 : chr  "BLQ" "Rec" "ALQ" "ALQ" ...
##  $ BsmtFin.SF.1   : int  639 468 923 1065 791 602 616 263 1180 0 ...
##  $ BsmtFin.Type.2 : chr  "Unf" "LwQ" "Unf" "Unf" ...
##  $ BsmtFin.SF.2   : int  0 144 0 0 0 0 0 0 0 0 ...
##  $ Bsmt.Unf.SF    : int  441 270 406 1045 137 324 722 1017 415 994 ...
##  $ Total.Bsmt.SF  : int  1080 882 1329 2110 928 926 1338 1280 1595 994 ...
##  $ Heating        : chr  "GasA" "GasA" "GasA" "GasA" ...
##  $ Heating.QC     : chr  "Fa" "TA" "TA" "Ex" ...
##  $ Central.Air    : chr  "Y" "Y" "Y" "Y" ...
##  $ Electrical     : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ X1st.Flr.SF    : int  1656 896 1329 2110 928 926 1338 1280 1616 1028 ...
##  $ X2nd.Flr.SF    : int  0 0 0 0 701 678 0 0 0 776 ...
##  $ Low.Qual.Fin.SF: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Gr.Liv.Area    : int  1656 896 1329 2110 1629 1604 1338 1280 1616 1804 ...
##  $ Bsmt.Full.Bath : int  1 0 0 1 0 0 1 0 1 0 ...
##  $ Bsmt.Half.Bath : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Full.Bath      : int  1 1 1 2 2 2 2 2 2 2 ...
##  $ Half.Bath      : int  0 0 1 1 1 1 0 0 0 1 ...
##  $ Bedroom.AbvGr  : int  3 2 3 3 3 3 2 2 2 3 ...
##  $ Kitchen.AbvGr  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Kitchen.Qual   : chr  "TA" "TA" "Gd" "Ex" ...
##  $ TotRms.AbvGrd  : int  7 5 6 8 6 7 6 5 5 7 ...
##  $ Functional     : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces     : int  2 0 0 2 1 1 0 0 1 1 ...
##  $ Fireplace.Qu   : chr  "Gd" NA NA "TA" ...
##  $ Garage.Type    : chr  "Attchd" "Attchd" "Attchd" "Attchd" ...
##  $ Garage.Yr.Blt  : int  1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
##  $ Garage.Finish  : chr  "Fin" "Unf" "Unf" "Fin" ...
##  $ Garage.Cars    : int  2 1 1 2 2 2 2 2 2 2 ...
##  $ Garage.Area    : int  528 730 312 522 482 470 582 506 608 442 ...
##  $ Garage.Qual    : chr  "TA" "TA" "TA" "TA" ...
##  $ Garage.Cond    : chr  "TA" "TA" "TA" "TA" ...
##  $ Paved.Drive    : chr  "P" "Y" "Y" "Y" ...
##  $ Wood.Deck.SF   : int  210 140 393 0 212 360 0 0 237 140 ...
##  $ Open.Porch.SF  : int  62 0 36 0 34 36 0 82 152 60 ...
##  $ Enclosed.Porch : int  0 0 0 0 0 0 170 0 0 0 ...
##  $ X3Ssn.Porch    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Screen.Porch   : int  0 120 0 0 0 0 0 144 0 0 ...
##  $ Pool.Area      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Pool.QC        : chr  NA NA NA NA ...
##  $ Fence          : chr  NA "MnPrv" NA NA ...
##  $ Misc.Feature   : chr  NA NA "Gar2" NA ...
##  $ Misc.Val       : int  0 0 12500 0 0 0 0 0 0 0 ...
##  $ Mo.Sold        : int  5 6 6 4 3 6 4 1 3 6 ...
##  $ Yr.Sold        : int  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ Sale.Type      : chr  "WD " "WD " "WD " "WD " ...
##  $ Sale.Condition : chr  "Normal" "Normal" "Normal" "Normal" ...
##  $ SalePrice      : int  215000 105000 172000 244000 189900 195500 213500 191500 236500 189000 ...

Generating Random Samples

# Seed the random number generator so the samples drawn below are reproducible.
set.seed(42)

# Every sub-sample holds half as many rows as the full data set.
sample_size <- round(nrow(ames) / 2)

Generate 5 random samples with replacement

# Draw five random sub-samples. Each one picks `sample_size` row indices from
# `ames` with replacement (so a row may appear more than once) and keeps the
# matching rows. The draws run in this order, which matters for the seeded RNG.
df_1 <- ames[sample.int(nrow(ames), size = sample_size, replace = TRUE), ]
df_2 <- ames[sample.int(nrow(ames), size = sample_size, replace = TRUE), ]
df_3 <- ames[sample.int(nrow(ames), size = sample_size, replace = TRUE), ]
df_4 <- ames[sample.int(nrow(ames), size = sample_size, replace = TRUE), ]
df_5 <- ames[sample.int(nrow(ames), size = sample_size, replace = TRUE), ]

Storing in a list for easy access

# Collect the five samples, in order, in an unnamed list so they can be
# processed with lapply() and indexed as samples[[k]].
samples <- list(df_1, df_2, df_3, df_4, df_5)

Analyzing and Comparing Sub-samples

# Descriptive statistics for 'SalePrice' and 'Lot.Area' in one sample.
#
# df: a data frame with numeric columns `SalePrice` and `Lot.Area`.
# Returns a one-row summary with the mean and standard deviation of each
# column; missing values are dropped before computing every statistic.
sample_summary <- function(df) {
  summarise(
    df,
    SalePrice_Mean = mean(SalePrice, na.rm = TRUE),
    SalePrice_SD = sd(SalePrice, na.rm = TRUE),
    LotArea_Mean = mean(Lot.Area, na.rm = TRUE),
    LotArea_SD = sd(Lot.Area, na.rm = TRUE)
  )
}
# Summarise every sample; the resulting list (one element per sample) is
# printed because the value is not assigned.
lapply(X = samples, FUN = sample_summary)
## [[1]]
##   SalePrice_Mean SalePrice_SD LotArea_Mean LotArea_SD
## 1       180784.4     80417.76     10518.42   10598.12
## 
## [[2]]
##   SalePrice_Mean SalePrice_SD LotArea_Mean LotArea_SD
## 1       181349.9     76805.24     10213.81   9113.274
## 
## [[3]]
##   SalePrice_Mean SalePrice_SD LotArea_Mean LotArea_SD
## 1       177605.1     79019.45     10055.88   6971.046
## 
## [[4]]
##   SalePrice_Mean SalePrice_SD LotArea_Mean LotArea_SD
## 1       176453.1     75093.82     10019.96   6916.942
## 
## [[5]]
##   SalePrice_Mean SalePrice_SD LotArea_Mean LotArea_SD
## 1       180218.3     78939.46     10273.12   10002.86

Insight:

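1. Consistency: Across the five samples, the mean and standard deviation of SalePrice and the mean of Lot.Area remain fairly consistent (SalePrice means range from about 176,000 to 181,000; Lot.Area means from about 10,000 to 10,500). This suggests stability in the central tendencies of these key variables.
2. Differences: While central tendencies (means) are stable, the standard deviation of Lot.Area varies noticeably between samples (from about 6,900 to 10,600). Because the standard deviation is sensitive to extreme values, this indicates potential outliers, such as a few very large lots, that appear in some samples but not others.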

Grouping and Identifying Anomalies

# Mean and standard deviation of SalePrice per zoning class in one sample.
#
# df: a data frame with columns `MS.Zoning` and `SalePrice`.
# Returns one row per distinct `MS.Zoning` value with `Mean_SalePrice` and
# `SalePrice_SD`. Missing prices are dropped first. A zoning class with a
# single sale has no defined standard deviation, so sd() gives NA for it.
sample_group_analysis <- function(df) {
  by_zoning <- group_by(df, MS.Zoning)
  summarise(
    by_zoning,
    Mean_SalePrice = mean(SalePrice, na.rm = TRUE),
    SalePrice_SD = sd(SalePrice, na.rm = TRUE)
  )
}

Apply the grouping analysis to each sample

# Run the per-zoning summary on every sample; the list of results is printed.
lapply(X = samples, FUN = sample_group_analysis)
## [[1]]
## # A tibble: 7 × 3
##   MS.Zoning Mean_SalePrice SalePrice_SD
##   <chr>              <dbl>        <dbl>
## 1 A (agr)           81500           NA 
## 2 C (all)           77286.       33648.
## 3 FV               220380.       51021.
## 4 I (all)           80312.       32085.
## 5 RH               135694        35014.
## 6 RL               190832.       83162.
## 7 RM               128576.       41940.
## 
## [[2]]
## # A tibble: 6 × 3
##   MS.Zoning Mean_SalePrice SalePrice_SD
##   <chr>              <dbl>        <dbl>
## 1 C (all)           90678        28343.
## 2 FV               219206.       48820.
## 3 I (all)           57625           NA 
## 4 RH               130996.       42730.
## 5 RL               192140.       76969.
## 6 RM               125028.       50041.
## 
## [[3]]
## # A tibble: 5 × 3
##   MS.Zoning Mean_SalePrice SalePrice_SD
##   <chr>              <dbl>        <dbl>
## 1 C (all)           83686.       33302.
## 2 FV               222710.       57978.
## 3 RH               141135.       34430.
## 4 RL               187239.       80274.
## 5 RM               124829.       48423.
## 
## [[4]]
## # A tibble: 7 × 3
##   MS.Zoning Mean_SalePrice SalePrice_SD
##   <chr>              <dbl>        <dbl>
## 1 A (agr)           47300        48366.
## 2 C (all)           84929.       32004.
## 3 FV               218822.       58960.
## 4 I (all)           57625           NA 
## 5 RH               142049.       30894.
## 6 RL               186906.       76194.
## 7 RM               127233.       45411.
## 
## [[5]]
## # A tibble: 7 × 3
##   MS.Zoning Mean_SalePrice SalePrice_SD
##   <chr>              <dbl>        <dbl>
## 1 A (agr)           81500           NA 
## 2 C (all)           84997.       22092.
## 3 FV               211170.       48464.
## 4 I (all)           57625           NA 
## 5 RH               127220        29399.
## 6 RL               191856.       80642.
## 7 RM               127384.       46025.

Insight:

1. Anomalies: One sample may show a significantly higher average SalePrice for a particular zoning type (e.g., FV or C (all)) that doesn’t appear in another. This indicates that anomalies in one sample might not generalize across the population.
2. Consistency: The RL (Residential Low Density) zoning type shows the most consistent SalePrice mean across all samples, indicating that more common categories are less susceptible to sample variability.

Monte Carlo Simulation

# Number of Monte Carlo repetitions.
num_simulations <- 1000
# Preallocate one slot per repetition for the resampled mean.
simulated_means <- numeric(num_simulations)

# Monte Carlo simulation to estimate the variability of the SalePrice mean.
# Only the SalePrice column is resampled: subsetting the whole data frame in
# every iteration would copy all of its columns just to average one of them.
# The row indices are drawn the same way as for the sub-samples (sample_size
# rows, with replacement), so the result for a given seed does not depend on
# whether the full rows or only the prices are picked.
sale_prices <- ames[["SalePrice"]]
n_rows <- nrow(ames)

# seq_len() gives an empty sequence when num_simulations is 0, whereas
# 1:num_simulations would iterate over c(1, 0).
for (i in seq_len(num_simulations)) {
  drawn_rows <- sample(n_rows, sample_size, replace = TRUE)
  simulated_means[i] <- mean(sale_prices[drawn_rows], na.rm = TRUE)
}

# Histogram of the simulated means (about 30 bins requested).
hist(
  simulated_means,
  breaks = 30,
  col = "skyblue",
  main = "Distribution of Simulated SalePrice Means",
  xlab = "Mean SalePrice"
)

Insight:

1. Monte Carlo Results: The mean SalePrice across simulations is fairly stable, with the standard deviation of the mean being small. This suggests that, while individual samples might show slight variations, the overall population mean remains consistent.
2. Future Implications: Understanding this variability helps us recognize the robustness of conclusions based on mean statistics. If anomalies or outliers skew a single sample, it’s less likely to affect the general conclusion when considering multiple samples or simulations.

Combining samples into one dataframe for easy comparison

# Stack the five samples into one data frame. The list names become the values
# of the new "Sample" column, identifying which sample each row came from.
samples_df <- bind_rows(
  setNames(samples[1:5], paste0("df_", 1:5)),
  .id = "Sample"
)

Boxplot for SalePrice across samples

# One box per sample, so the SalePrice distributions can be compared side by side.
ggplot(data = samples_df, mapping = aes(x = Sample, y = SalePrice)) +
  geom_boxplot(fill = "lightblue") +
  labs(
    title = "Distribution of SalePrice Across Samples",
    x = "Sample",
    y = "SalePrice"
  ) +
  theme_minimal()

Density plot for Lot.Area across samples

# Overlaid, semi-transparent density curves of Lot.Area, one colour per sample.
ggplot(data = samples_df, mapping = aes(x = Lot.Area, fill = Sample)) +
  geom_density(alpha = 0.6) +
  labs(
    title = "Density Plot of Lot Area Across Samples",
    x = "Lot Area",
    y = "Density"
  ) +
  theme_minimal() +
  scale_fill_brewer(palette = "Set3")

Summarizing mean and SD for visualization

# Build one summary row per sample and label it with the sample's name
# ("df_1" ... "df_5") in a trailing `Sample` column, then stack the rows.
sample_summary_df <- bind_rows(
  Map(
    function(one_sample, label) {
      mutate(sample_summary(one_sample), Sample = label)
    },
    samples[1:5],
    paste0("df_", 1:5)
  )
)

Bar plot for mean and SD of SalePrice

# Bars show each sample's mean SalePrice; the error bars extend one standard
# deviation below and above that mean.
ggplot(data = sample_summary_df, mapping = aes(x = Sample)) +
  geom_bar(
    mapping = aes(y = SalePrice_Mean),
    stat = "identity",
    fill = "skyblue"
  ) +
  geom_errorbar(
    mapping = aes(
      ymin = SalePrice_Mean - SalePrice_SD,
      ymax = SalePrice_Mean + SalePrice_SD
    ),
    width = 0.2
  ) +
  labs(
    title = "Mean and Standard Deviation of SalePrice Across Samples",
    y = "Mean SalePrice"
  ) +
  theme_minimal()

Plotting the simulated means from Monte Carlo simulation

# Trace of the simulated means in the order they were generated, with a dashed
# red line at their overall mean. The iteration index is stored as a column of
# the plotted data frame, so both aesthetics come from the same data and the
# x values always match the number of simulated means.
ggplot(
  data.frame(
    iteration = seq_along(simulated_means),
    mean_sale_price = simulated_means
  ),
  aes(x = iteration, y = mean_sale_price)
) +
  geom_line(color = "blue") +
  geom_hline(
    yintercept = mean(simulated_means),
    linetype = "dashed",
    color = "red"
  ) +
  labs(
    title = "Monte Carlo Simulation of SalePrice Means",
    x = "Simulation Iteration",
    y = "Mean SalePrice"
  ) +
  theme_minimal()

Conclusion

Key Insights Recap:

Subsample Stability: The mean and standard deviation of key metrics like SalePrice are fairly stable across subsamples, indicating robustness in overall trends.
Anomalies: Potential outliers or anomalies might appear in one subsample but not another, particularly for rare categories.
Monte Carlo Simulation: Helps to visualize how much variability to expect across different samples, giving confidence in the overall population trends.