```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
```
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
# Read the Ames housing data and print a compact overview of its columns.
# NOTE(review): the path is an absolute, machine-specific location, so the
# script only runs where this exact file exists.
ames <- read.csv(file = "D:/Stats for DS/ames.csv", header = TRUE)
str(ames)
## 'data.frame': 2930 obs. of 82 variables:
## $ Order : int 1 2 3 4 5 6 7 8 9 10 ...
## $ PID : int 526301100 526350040 526351010 526353030 527105010 527105030 527127150 527145080 527146030 527162130 ...
## $ MS.SubClass : int 20 20 20 20 60 60 120 120 120 60 ...
## $ MS.Zoning : chr "RL" "RH" "RL" "RL" ...
## $ Lot.Frontage : int 141 80 81 93 74 78 41 43 39 60 ...
## $ Lot.Area : int 31770 11622 14267 11160 13830 9978 4920 5005 5389 7500 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr NA NA NA NA ...
## $ Lot.Shape : chr "IR1" "Reg" "IR1" "Reg" ...
## $ Land.Contour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ Lot.Config : chr "Corner" "Inside" "Corner" "Corner" ...
## $ Land.Slope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr "NAmes" "NAmes" "NAmes" "NAmes" ...
## $ Condition.1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition.2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ Bldg.Type : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ House.Style : chr "1Story" "1Story" "1Story" "1Story" ...
## $ Overall.Qual : int 6 5 6 7 5 6 8 8 8 7 ...
## $ Overall.Cond : int 5 6 6 5 5 6 5 5 5 5 ...
## $ Year.Built : int 1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
## $ Year.Remod.Add : int 1960 1961 1958 1968 1998 1998 2001 1992 1996 1999 ...
## $ Roof.Style : chr "Hip" "Gable" "Hip" "Hip" ...
## $ Roof.Matl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior.1st : chr "BrkFace" "VinylSd" "Wd Sdng" "BrkFace" ...
## $ Exterior.2nd : chr "Plywood" "VinylSd" "Wd Sdng" "BrkFace" ...
## $ Mas.Vnr.Type : chr "Stone" "None" "BrkFace" "None" ...
## $ Mas.Vnr.Area : int 112 0 108 0 0 20 0 0 0 0 ...
## $ Exter.Qual : chr "TA" "TA" "TA" "Gd" ...
## $ Exter.Cond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "CBlock" "CBlock" "CBlock" "CBlock" ...
## $ Bsmt.Qual : chr "TA" "TA" "TA" "TA" ...
## $ Bsmt.Cond : chr "Gd" "TA" "TA" "TA" ...
## $ Bsmt.Exposure : chr "Gd" "No" "No" "No" ...
## $ BsmtFin.Type.1 : chr "BLQ" "Rec" "ALQ" "ALQ" ...
## $ BsmtFin.SF.1 : int 639 468 923 1065 791 602 616 263 1180 0 ...
## $ BsmtFin.Type.2 : chr "Unf" "LwQ" "Unf" "Unf" ...
## $ BsmtFin.SF.2 : int 0 144 0 0 0 0 0 0 0 0 ...
## $ Bsmt.Unf.SF : int 441 270 406 1045 137 324 722 1017 415 994 ...
## $ Total.Bsmt.SF : int 1080 882 1329 2110 928 926 1338 1280 1595 994 ...
## $ Heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ Heating.QC : chr "Fa" "TA" "TA" "Ex" ...
## $ Central.Air : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1st.Flr.SF : int 1656 896 1329 2110 928 926 1338 1280 1616 1028 ...
## $ X2nd.Flr.SF : int 0 0 0 0 701 678 0 0 0 776 ...
## $ Low.Qual.Fin.SF: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Gr.Liv.Area : int 1656 896 1329 2110 1629 1604 1338 1280 1616 1804 ...
## $ Bsmt.Full.Bath : int 1 0 0 1 0 0 1 0 1 0 ...
## $ Bsmt.Half.Bath : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Full.Bath : int 1 1 1 2 2 2 2 2 2 2 ...
## $ Half.Bath : int 0 0 1 1 1 1 0 0 0 1 ...
## $ Bedroom.AbvGr : int 3 2 3 3 3 3 2 2 2 3 ...
## $ Kitchen.AbvGr : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Kitchen.Qual : chr "TA" "TA" "Gd" "Ex" ...
## $ TotRms.AbvGrd : int 7 5 6 8 6 7 6 5 5 7 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : int 2 0 0 2 1 1 0 0 1 1 ...
## $ Fireplace.Qu : chr "Gd" NA NA "TA" ...
## $ Garage.Type : chr "Attchd" "Attchd" "Attchd" "Attchd" ...
## $ Garage.Yr.Blt : int 1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
## $ Garage.Finish : chr "Fin" "Unf" "Unf" "Fin" ...
## $ Garage.Cars : int 2 1 1 2 2 2 2 2 2 2 ...
## $ Garage.Area : int 528 730 312 522 482 470 582 506 608 442 ...
## $ Garage.Qual : chr "TA" "TA" "TA" "TA" ...
## $ Garage.Cond : chr "TA" "TA" "TA" "TA" ...
## $ Paved.Drive : chr "P" "Y" "Y" "Y" ...
## $ Wood.Deck.SF : int 210 140 393 0 212 360 0 0 237 140 ...
## $ Open.Porch.SF : int 62 0 36 0 34 36 0 82 152 60 ...
## $ Enclosed.Porch : int 0 0 0 0 0 0 170 0 0 0 ...
## $ X3Ssn.Porch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Screen.Porch : int 0 120 0 0 0 0 0 144 0 0 ...
## $ Pool.Area : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Pool.QC : chr NA NA NA NA ...
## $ Fence : chr NA "MnPrv" NA NA ...
## $ Misc.Feature : chr NA NA "Gar2" NA ...
## $ Misc.Val : int 0 0 12500 0 0 0 0 0 0 0 ...
## $ Mo.Sold : int 5 6 6 4 3 6 4 1 3 6 ...
## $ Yr.Sold : int 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ Sale.Type : chr "WD " "WD " "WD " "WD " ...
## $ Sale.Condition : chr "Normal" "Normal" "Normal" "Normal" ...
## $ SalePrice : int 215000 105000 172000 244000 189900 195500 213500 191500 236500 189000 ...
set.seed(42) # For reproducibility

# Each sample holds half as many rows as the full data set (rounded).
sample_size <- round(0.5 * nrow(ames))

# Draw five samples of rows with replacement. replicate() evaluates the
# expression once per sample, in order, so the random draws happen in the
# same sequence as five separate sample() calls would.
samples <- replicate(
  5,
  ames[sample(nrow(ames), sample_size, replace = TRUE), ],
  simplify = FALSE
)

# Keep the individual samples available under their own names as well.
df_1 <- samples[[1]]
df_2 <- samples[[2]]
df_3 <- samples[[3]]
df_4 <- samples[[4]]
df_5 <- samples[[5]]
# Descriptive statistics for 'SalePrice' and 'Lot.Area'.
#
# Takes a data frame `df` with SalePrice and Lot.Area columns and returns a
# one-row summary holding the mean and standard deviation of each column,
# with missing values ignored.
sample_summary <- function(df) {
  summarise(
    df,
    SalePrice_Mean = mean(SalePrice, na.rm = TRUE),
    SalePrice_SD = sd(SalePrice, na.rm = TRUE),
    LotArea_Mean = mean(Lot.Area, na.rm = TRUE),
    LotArea_SD = sd(Lot.Area, na.rm = TRUE)
  )
}

# Summarise every sample in turn; the resulting list prints one table each.
lapply(samples, sample_summary)
## [[1]]
## SalePrice_Mean SalePrice_SD LotArea_Mean LotArea_SD
## 1 180784.4 80417.76 10518.42 10598.12
##
## [[2]]
## SalePrice_Mean SalePrice_SD LotArea_Mean LotArea_SD
## 1 181349.9 76805.24 10213.81 9113.274
##
## [[3]]
## SalePrice_Mean SalePrice_SD LotArea_Mean LotArea_SD
## 1 177605.1 79019.45 10055.88 6971.046
##
## [[4]]
## SalePrice_Mean SalePrice_SD LotArea_Mean LotArea_SD
## 1 176453.1 75093.82 10019.96 6916.942
##
## [[5]]
## SalePrice_Mean SalePrice_SD LotArea_Mean LotArea_SD
## 1 180218.3 78939.46 10273.12 10002.86
# Mean and standard deviation of SalePrice within each MS.Zoning category.
#
# Takes a data frame `df` with MS.Zoning and SalePrice columns and returns one
# row per zoning category present in `df`. SalePrice_SD is NA for a category
# with fewer than two non-missing prices, because sd() needs at least two
# values.
sample_group_analysis <- function(df) {
  by_zoning <- group_by(df, MS.Zoning)
  summarise(
    by_zoning,
    Mean_SalePrice = mean(SalePrice, na.rm = TRUE),
    SalePrice_SD = sd(SalePrice, na.rm = TRUE)
  )
}

# Run the per-zoning summary on every sample.
lapply(samples, sample_group_analysis)
## [[1]]
## # A tibble: 7 × 3
## MS.Zoning Mean_SalePrice SalePrice_SD
## <chr> <dbl> <dbl>
## 1 A (agr) 81500 NA
## 2 C (all) 77286. 33648.
## 3 FV 220380. 51021.
## 4 I (all) 80312. 32085.
## 5 RH 135694 35014.
## 6 RL 190832. 83162.
## 7 RM 128576. 41940.
##
## [[2]]
## # A tibble: 6 × 3
## MS.Zoning Mean_SalePrice SalePrice_SD
## <chr> <dbl> <dbl>
## 1 C (all) 90678 28343.
## 2 FV 219206. 48820.
## 3 I (all) 57625 NA
## 4 RH 130996. 42730.
## 5 RL 192140. 76969.
## 6 RM 125028. 50041.
##
## [[3]]
## # A tibble: 5 × 3
## MS.Zoning Mean_SalePrice SalePrice_SD
## <chr> <dbl> <dbl>
## 1 C (all) 83686. 33302.
## 2 FV 222710. 57978.
## 3 RH 141135. 34430.
## 4 RL 187239. 80274.
## 5 RM 124829. 48423.
##
## [[4]]
## # A tibble: 7 × 3
## MS.Zoning Mean_SalePrice SalePrice_SD
## <chr> <dbl> <dbl>
## 1 A (agr) 47300 48366.
## 2 C (all) 84929. 32004.
## 3 FV 218822. 58960.
## 4 I (all) 57625 NA
## 5 RH 142049. 30894.
## 6 RL 186906. 76194.
## 7 RM 127233. 45411.
##
## [[5]]
## # A tibble: 7 × 3
## MS.Zoning Mean_SalePrice SalePrice_SD
## <chr> <dbl> <dbl>
## 1 A (agr) 81500 NA
## 2 C (all) 84997. 22092.
## 3 FV 211170. 48464.
## 4 I (all) 57625 NA
## 5 RH 127220 29399.
## 6 RL 191856. 80642.
## 7 RM 127384. 46025.
# Number of simulations
num_simulations <- 1000

# Monte Carlo estimate of how much the mean SalePrice varies between
# resamples: each iteration draws `sample_size` row indices with replacement
# and records the mean SalePrice of the rows drawn.
#
# Only the SalePrice column is resampled rather than the whole data frame.
# The sample() calls are unchanged, so the means are the same for the same
# random-number state, but no iteration copies every column of `ames`.
# seq_len() keeps the iteration count correct when num_simulations is 0,
# where 1:num_simulations would iterate over c(1, 0).
simulated_means <- vapply(
  seq_len(num_simulations),
  function(iteration) {
    drawn_rows <- sample(nrow(ames), sample_size, replace = TRUE)
    mean(ames$SalePrice[drawn_rows], na.rm = TRUE)
  },
  numeric(1)
)

# Plot the distribution of simulated means
hist(
  simulated_means,
  breaks = 30,
  col = "skyblue",
  main = "Distribution of Simulated SalePrice Means",
  xlab = "Mean SalePrice"
)
# Stack the samples into one data frame. The list names (df_1, df_2, ...)
# become the values of the "Sample" column, identifying each row's origin.
samples_df <- bind_rows(
  setNames(samples, paste0("df_", seq_along(samples))),
  .id = "Sample"
)

# Box plot of SalePrice, one box per sample.
ggplot(samples_df, aes(x = Sample, y = SalePrice)) +
  geom_boxplot(fill = "lightblue") +
  labs(
    title = "Distribution of SalePrice Across Samples",
    x = "Sample",
    y = "SalePrice"
  ) +
  theme_minimal()

# Overlaid density curves of Lot.Area, one semi-transparent curve per sample.
ggplot(samples_df, aes(x = Lot.Area, fill = Sample)) +
  geom_density(alpha = 0.6) +
  labs(
    title = "Density Plot of Lot Area Across Samples",
    x = "Lot Area",
    y = "Density"
  ) +
  theme_minimal() +
  scale_fill_brewer(palette = "Set3")
# One summary row per sample, labelled df_1, df_2, ... in sample order.
sample_summary_df <- bind_rows(lapply(samples, sample_summary)) %>%
  mutate(Sample = paste0("df_", seq_along(samples)))

# Bar per sample at the mean SalePrice, with error bars spanning one standard
# deviation either side of the mean.
ggplot(sample_summary_df, aes(x = Sample)) +
  geom_bar(aes(y = SalePrice_Mean), stat = "identity", fill = "skyblue") +
  geom_errorbar(
    aes(
      ymin = SalePrice_Mean - SalePrice_SD,
      ymax = SalePrice_Mean + SalePrice_SD
    ),
    width = 0.2
  ) +
  labs(
    title = "Mean and Standard Deviation of SalePrice Across Samples",
    y = "Mean SalePrice"
  ) +
  theme_minimal()
# Trace of the simulated mean SalePrice by iteration, with a dashed red line
# marking the average of all simulated means.
ggplot(
  data.frame(
    iteration = seq_along(simulated_means),
    mean_price = simulated_means
  ),
  aes(x = iteration, y = mean_price)
) +
  geom_line(color = "blue") +
  geom_hline(
    yintercept = mean(simulated_means),
    linetype = "dashed",
    color = "red"
  ) +
  labs(
    title = "Monte Carlo Simulation of SalePrice Means",
    x = "Simulation Iteration",
    y = "Mean SalePrice"
  ) +
  theme_minimal()