#Task 1: Create 5 Random Subsamples
# Loading necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the New York housing listings from CSV into a data frame.
# NOTE(review): the path is absolute and points into one Windows user's
# Downloads folder, so the script will not find the file elsewhere as written.
NY_House_Dataset <- read.csv(
  file = "C:\\Users\\velag\\Downloads\\NY-House-Dataset.csv"
)
# Pin the random number generator so the random draws made after this point
# are repeatable from run to run
set.seed(seed = 123)
# Draw 5 random subsamples, each holding half as many rows as the full data.
# Rows are drawn with replacement, so the same listing can appear more than
# once inside a subsample.
sample_size <- round(nrow(NY_House_Dataset) * 0.5)
draw_subsample <- function() {
  sample_n(NY_House_Dataset, size = sample_size, replace = TRUE)
}
# Each call consumes random numbers, so the draws are kept in this fixed order
df_1 <- draw_subsample()
df_2 <- draw_subsample()
df_3 <- draw_subsample()
df_4 <- draw_subsample()
df_5 <- draw_subsample()
# Column-by-column summary statistics for every subsample, kept in their own
# objects so each one can be printed separately
summary_df_1 <- summary(df_1)
summary_df_2 <- summary(df_2)
summary_df_3 <- summary(df_3)
summary_df_4 <- summary(df_4)
summary_df_5 <- summary(df_5)
# Show the summary table for subsample 1
summary_df_1
## BROKERTITLE TYPE PRICE BEDS
## Length:2400 Length:2400 Min. : 2494 Min. : 1.000
## Class :character Class :character 1st Qu.: 498000 1st Qu.: 2.000
## Mode :character Mode :character Median : 810000 Median : 3.000
## Mean : 1961327 Mean : 3.341
## 3rd Qu.: 1500000 3rd Qu.: 4.000
## Max. :195000000 Max. :50.000
## BATH PROPERTYSQFT ADDRESS STATE
## Min. : 0.000 Min. : 275 Length:2400 Length:2400
## 1st Qu.: 1.000 1st Qu.: 1213 Class :character Class :character
## Median : 2.000 Median : 2184 Mode :character Mode :character
## Mean : 2.355 Mean : 2115
## 3rd Qu.: 3.000 3rd Qu.: 2184
## Max. :50.000 Max. :22035
## ADMINISTRATIVE_AREA_LEVEL_2 LOCALITY SUBLOCALITY
## Length:2400 Length:2400 Length:2400
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## STREET_NAME LONG_NAME FORMATTED_ADDRESS LATITUDE
## Length:2400 Length:2400 Length:2400 Min. :40.50
## Class :character Class :character Class :character 1st Qu.:40.64
## Mode :character Mode :character Mode :character Median :40.73
## Mean :40.71
## 3rd Qu.:40.77
## Max. :40.91
## LONGITUDE
## Min. :-74.25
## 1st Qu.:-73.99
## Median :-73.95
## Mean :-73.94
## 3rd Qu.:-73.88
## Max. :-73.70
# Print the stored summary table for subsample 2.
# NOTE(review): in the recorded output below, Max. PRICE (2.147e+09) is about
# the 32-bit integer maximum and Max. PROPERTYSQFT (65535) equals 2^16 - 1;
# these look like capped/placeholder values and would inflate the means —
# TODO confirm against the raw data.
summary_df_2
## BROKERTITLE TYPE PRICE BEDS
## Length:2400 Length:2400 Min. :2.494e+03 Min. : 1.000
## Class :character Class :character 1st Qu.:4.990e+05 1st Qu.: 2.000
## Mode :character Mode :character Median :8.000e+05 Median : 3.000
## Mean :2.780e+06 Mean : 3.291
## 3rd Qu.:1.449e+06 3rd Qu.: 4.000
## Max. :2.147e+09 Max. :32.000
## BATH PROPERTYSQFT ADDRESS STATE
## Min. : 0.000 Min. : 250 Length:2400 Length:2400
## 1st Qu.: 1.000 1st Qu.: 1186 Class :character Class :character
## Median : 2.000 Median : 2184 Mode :character Mode :character
## Mean : 2.295 Mean : 2164
## 3rd Qu.: 3.000 3rd Qu.: 2184
## Max. :20.000 Max. :65535
## ADMINISTRATIVE_AREA_LEVEL_2 LOCALITY SUBLOCALITY
## Length:2400 Length:2400 Length:2400
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## STREET_NAME LONG_NAME FORMATTED_ADDRESS LATITUDE
## Length:2400 Length:2400 Length:2400 Min. :40.50
## Class :character Class :character Class :character 1st Qu.:40.64
## Mode :character Mode :character Mode :character Median :40.73
## Mean :40.71
## 3rd Qu.:40.77
## Max. :40.91
## LONGITUDE
## Min. :-74.25
## 1st Qu.:-73.99
## Median :-73.95
## Mean :-73.94
## 3rd Qu.:-73.87
## Max. :-73.70
# Print the stored summary table for subsample 3.
# NOTE(review): in the recorded output below, Max. PROPERTYSQFT is 65535
# (2^16 - 1), which looks like a capped/placeholder value — TODO confirm
# against the raw data.
summary_df_3
## BROKERTITLE TYPE PRICE BEDS
## Length:2400 Length:2400 Min. : 2494 Min. : 1.000
## Class :character Class :character 1st Qu.: 499000 1st Qu.: 2.000
## Mode :character Mode :character Median : 800000 Median : 3.000
## Mean : 2017308 Mean : 3.344
## 3rd Qu.: 1495000 3rd Qu.: 4.000
## Max. :195000000 Max. :32.000
## BATH PROPERTYSQFT ADDRESS STATE
## Min. : 0.000 Min. : 260 Length:2400 Length:2400
## 1st Qu.: 1.000 1st Qu.: 1230 Class :character Class :character
## Median : 2.000 Median : 2184 Mode :character Mode :character
## Mean : 2.349 Mean : 2279
## 3rd Qu.: 3.000 3rd Qu.: 2184
## Max. :32.000 Max. :65535
## ADMINISTRATIVE_AREA_LEVEL_2 LOCALITY SUBLOCALITY
## Length:2400 Length:2400 Length:2400
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## STREET_NAME LONG_NAME FORMATTED_ADDRESS LATITUDE
## Length:2400 Length:2400 Length:2400 Min. :40.50
## Class :character Class :character Class :character 1st Qu.:40.64
## Mode :character Mode :character Mode :character Median :40.73
## Mean :40.71
## 3rd Qu.:40.77
## Max. :40.91
## LONGITUDE
## Min. :-74.25
## 1st Qu.:-73.99
## Median :-73.95
## Mean :-73.94
## 3rd Qu.:-73.86
## Max. :-73.70
# Print the stored summary table for subsample 4.
# NOTE(review): in the recorded output below, Max. PRICE (2.147e+09) is about
# the 32-bit integer maximum; it looks like a capped/placeholder value and
# would inflate the mean price — TODO confirm against the raw data.
summary_df_4
## BROKERTITLE TYPE PRICE BEDS
## Length:2400 Length:2400 Min. :2.494e+03 Min. : 1.000
## Class :character Class :character 1st Qu.:4.838e+05 1st Qu.: 2.000
## Mode :character Mode :character Median :8.000e+05 Median : 3.000
## Mean :3.785e+06 Mean : 3.246
## 3rd Qu.:1.488e+06 3rd Qu.: 4.000
## Max. :2.147e+09 Max. :40.000
## BATH PROPERTYSQFT ADDRESS STATE
## Min. : 1.000 Min. : 246 Length:2400 Length:2400
## 1st Qu.: 1.000 1st Qu.: 1214 Class :character Class :character
## Median : 2.000 Median : 2184 Mode :character Mode :character
## Mean : 2.301 Mean : 2152
## 3rd Qu.: 3.000 3rd Qu.: 2184
## Max. :20.000 Max. :55300
## ADMINISTRATIVE_AREA_LEVEL_2 LOCALITY SUBLOCALITY
## Length:2400 Length:2400 Length:2400
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## STREET_NAME LONG_NAME FORMATTED_ADDRESS LATITUDE
## Length:2400 Length:2400 Length:2400 Min. :40.50
## Class :character Class :character Class :character 1st Qu.:40.64
## Mode :character Mode :character Mode :character Median :40.73
## Mean :40.71
## 3rd Qu.:40.77
## Max. :40.91
## LONGITUDE
## Min. :-74.25
## 1st Qu.:-73.99
## Median :-73.95
## Mean :-73.94
## 3rd Qu.:-73.87
## Max. :-73.70
# Print the stored summary table for subsample 5.
# NOTE(review): in the recorded output below, Max. PRICE (2.147e+09) is about
# the 32-bit integer maximum and Max. PROPERTYSQFT (65535) equals 2^16 - 1;
# these look like capped/placeholder values and would inflate the means —
# TODO confirm against the raw data.
summary_df_5
## BROKERTITLE TYPE PRICE BEDS
## Length:2400 Length:2400 Min. :4.950e+04 Min. : 1.000
## Class :character Class :character 1st Qu.:4.750e+05 1st Qu.: 2.000
## Mode :character Mode :character Median :8.000e+05 Median : 3.000
## Mean :2.900e+06 Mean : 3.381
## 3rd Qu.:1.465e+06 3rd Qu.: 4.000
## Max. :2.147e+09 Max. :50.000
## BATH PROPERTYSQFT ADDRESS STATE
## Min. : 0.000 Min. : 230 Length:2400 Length:2400
## 1st Qu.: 1.000 1st Qu.: 1192 Class :character Class :character
## Median : 2.000 Median : 2184 Mode :character Mode :character
## Mean : 2.394 Mean : 2238
## 3rd Qu.: 3.000 3rd Qu.: 2184
## Max. :50.000 Max. :65535
## ADMINISTRATIVE_AREA_LEVEL_2 LOCALITY SUBLOCALITY
## Length:2400 Length:2400 Length:2400
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## STREET_NAME LONG_NAME FORMATTED_ADDRESS LATITUDE
## Length:2400 Length:2400 Length:2400 Min. :40.50
## Class :character Class :character Class :character 1st Qu.:40.64
## Mode :character Mode :character Mode :character Median :40.72
## Mean :40.71
## 3rd Qu.:40.77
## Max. :40.91
## LONGITUDE
## Min. :-74.25
## 1st Qu.:-73.99
## Median :-73.95
## Mean :-73.94
## 3rd Qu.:-73.87
## Max. :-73.70
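# Explanation to Reader:
# Creating Subsamples:
# Insight: Each subsample is drawn at random, with replacement, and is half the size of the full dataset; this simulates repeatedly collecting data from a population.
# Significance: Comparing the subsamples shows how much summary statistics can vary purely because of which rows happened to be sampled.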
#Task 2: Scrutinize Subsamples
# Average listing price per property type, computed separately for each
# subsample. The helper groups one subsample by TYPE and takes the mean PRICE
# within each group.
avg_price_by_type <- function(subsample) {
  by_type <- group_by(subsample, TYPE)
  summarise(by_type, avg_price = mean(PRICE))
}
summary_1 <- avg_price_by_type(df_1)
summary_2 <- avg_price_by_type(df_2)
summary_3 <- avg_price_by_type(df_3)
summary_4 <- avg_price_by_type(df_4)
summary_5 <- avg_price_by_type(df_5)
# Print the per-type averages for subsample 1
summary_1
## # A tibble: 13 × 2
## TYPE avg_price
## <chr> <dbl>
## 1 Co-op for sale 1009221.
## 2 Coming Soon 1695000
## 3 Condo for sale 2775153.
## 4 Condop for sale 839000
## 5 Contingent 743068.
## 6 For sale 2493444.
## 7 Foreclosure 2169908.
## 8 House for sale 1379290.
## 9 Land for sale 1234181.
## 10 Mobile house for sale 1288000
## 11 Multi-family home for sale 1572505.
## 12 Pending 1608402.
## 13 Townhouse for sale 7368926.
# Print the per-type average prices for subsample 2.
# The recorded output below has 11 rows, so not every property type seen in
# other subsamples was drawn into this one.
summary_2
## # A tibble: 11 × 2
## TYPE avg_price
## <chr> <dbl>
## 1 Co-op for sale 1010140.
## 2 Condo for sale 2285626.
## 3 Condop for sale 934333.
## 4 Contingent 741809.
## 5 For sale 3818949.
## 6 Foreclosure 3391557.
## 7 House for sale 5953653.
## 8 Land for sale 1202071.
## 9 Multi-family home for sale 1845679.
## 10 Pending 1043581.
## 11 Townhouse for sale 7045632.
# Print the per-type average prices for subsample 3.
# The recorded output below has 12 rows (property types present in this draw).
summary_3
## # A tibble: 12 × 2
## TYPE avg_price
## <chr> <dbl>
## 1 Co-op for sale 1229866.
## 2 Coming Soon 649000
## 3 Condo for sale 3222034.
## 4 Condop for sale 1125000
## 5 Contingent 737901.
## 6 For sale 2888749
## 7 Foreclosure 1886100
## 8 House for sale 1615732.
## 9 Land for sale 761723.
## 10 Multi-family home for sale 1634963.
## 11 Pending 1206786.
## 12 Townhouse for sale 6272132.
# Print the per-type average prices for subsample 4.
# NOTE(review): the recorded "House for sale" average below (about 9.7 million)
# is far above the other subsamples; presumably driven by a single extreme
# PRICE value — TODO confirm.
summary_4
## # A tibble: 12 × 2
## TYPE avg_price
## <chr> <dbl>
## 1 Co-op for sale 1143785.
## 2 Coming Soon 1695000
## 3 Condo for sale 2879929.
## 4 Condop for sale 1110000
## 5 Contingent 746550.
## 6 For sale 1868143.
## 7 Foreclosure 2007414.
## 8 House for sale 9749676.
## 9 Land for sale 687036
## 10 Multi-family home for sale 1685311.
## 11 Pending 1825273.
## 12 Townhouse for sale 6677086.
# Print the per-type average prices for subsample 5.
# The recorded output below has 11 rows, so not every property type seen in
# other subsamples was drawn into this one.
summary_5
## # A tibble: 11 × 2
## TYPE avg_price
## <chr> <dbl>
## 1 Co-op for sale 1009669.
## 2 Coming Soon 910500
## 3 Condo for sale 3444335.
## 4 Contingent 809024.
## 5 For sale 1965667.
## 6 Foreclosure 555543.
## 7 House for sale 5576208.
## 8 Land for sale 1232289.
## 9 Multi-family home for sale 1768984.
## 10 Pending 1501943.
## 11 Townhouse for sale 6495648.
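# Explanation to Reader:
# Scrutinizing Subsamples:
# Insight: Each subsample can look quite different; for example, the average "House for sale" price ranges from about 1.4 million to about 9.7 million across the five subsamples, and the number of property types present ranges from 11 to 13.
# Significance: Identifying variations in averages for different property types helps us recognize the impact of sampling variability.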
#Task 3: Monte Carlo Simulation
# Monte Carlo simulation of the average price: 1000 times over, draw
# `sample_size` rows with replacement and record that draw's mean PRICE.
simulations <- vapply(
  seq_len(1000),
  function(iteration) {
    resampled <- sample_n(NY_House_Dataset, size = sample_size, replace = TRUE)
    mean(resampled[["PRICE"]])
  },
  numeric(1)
)
# Histogram of the 1000 simulated mean prices
hist(
  simulations,
  main = "Monte Carlo Simulation of Average Price",
  xlab = "Simulated Mean Price",
  col = "lightblue",
  border = "black"
)

#Explanation to Reader:
#Monte Carlo Simulation:
#Insight: Simulating multiple samples helps visualize the distribution of sample means.
#Significance: It demonstrates the potential range of average prices we might observe due to sampling variability.
#Task 4: Considerations for Drawing Conclusions
# Load necessary libraries
library(ggplot2)
# Stack the five per-type summaries into one long table, tagging every row
# with the subsample it came from so the subsamples can be compared
compare_summaries <- bind_rows(
  mutate(summary_1, Sample = "1"),
  mutate(summary_2, Sample = "2"),
  mutate(summary_3, Sample = "3"),
  mutate(summary_4, Sample = "4"),
  mutate(summary_5, Sample = "5")
)
# Grouped bar chart: one cluster per subsample, one bar per property type.
# Not every property type occurs in every subsample; preserve = "single"
# keeps each bar the same width instead of stretching the remaining bars to
# fill the cluster, so bar widths stay comparable across subsamples.
ggplot(compare_summaries, aes(x = Sample, y = avg_price, fill = TYPE)) +
  geom_col(position = position_dodge(preserve = "single")) +
  labs(
    title = "Comparison of Average Prices in Subsamples",
    x = "Subsample", y = "Average Price", fill = "Property Type"
  )

# Explanation to Reader:
# Considering Conclusions:
# Insight: Placing the per-type average prices from all five subsamples side by side makes their differences visible.
# Significance: Identifying consistent patterns or variations across subsamples informs us about the stability of conclusions drawn from the data.
#Task 5: Further Questions
# Open questions raised by the sampling exercise, kept as a character vector
# and printed for the reader
further_questions_sampling <- c(
  # Confidence in the estimates
  "How does the variability in subsamples impact our confidence in average price estimates?",
  # Differences between property types
  "Are there specific property types that exhibit more variability in subsample averages?",
  # Effect of subsample size
  "What is the trade-off between larger and smaller subsample sizes in terms of result stability?"
)
further_questions_sampling
## [1] "How does the variability in subsamples impact our confidence in average price estimates?"
## [2] "Are there specific property types that exhibit more variability in subsample averages?"
## [3] "What is the trade-off between larger and smaller subsample sizes in terms of result stability?"
# Explanation to Reader:
# Further Questions:
# Insight: The exercise ends by reflecting on the implications of sampling variability and raising additional questions.
# Significance: Encourages critical thinking about the reliability of conclusions and potential avenues for future exploration.