#Task 1: Create 5 Random Subsamples
# Loading necessary libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read the New York housing listings from the local CSV export
NY_House_Dataset <- read.csv("C:\\Users\\velag\\Downloads\\NY-House-Dataset.csv")

# Fix the RNG state so every run draws the same subsamples
set.seed(123)

# Each subsample holds half as many rows as the full dataset
sample_size <- round(0.5 * nrow(NY_House_Dataset))

# Draw 5 subsamples with replacement (a row may appear more than once).
# The draws happen in the order df_1..df_5, all from the single seed above.
df_1 <- sample_n(NY_House_Dataset, size = sample_size, replace = TRUE)
df_2 <- sample_n(NY_House_Dataset, size = sample_size, replace = TRUE)
df_3 <- sample_n(NY_House_Dataset, size = sample_size, replace = TRUE)
df_4 <- sample_n(NY_House_Dataset, size = sample_size, replace = TRUE)
df_5 <- sample_n(NY_House_Dataset, size = sample_size, replace = TRUE)

# Column-by-column summary statistics for each of the five subsamples
summary_df_1 <- summary(df_1)
summary_df_2 <- summary(df_2)
summary_df_3 <- summary(df_3)
summary_df_4 <- summary(df_4)
summary_df_5 <- summary(df_5)

summary_df_1
##  BROKERTITLE            TYPE               PRICE                BEDS       
##  Length:2400        Length:2400        Min.   :     2494   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.:   498000   1st Qu.: 2.000  
##  Mode  :character   Mode  :character   Median :   810000   Median : 3.000  
##                                        Mean   :  1961327   Mean   : 3.341  
##                                        3rd Qu.:  1500000   3rd Qu.: 4.000  
##                                        Max.   :195000000   Max.   :50.000  
##       BATH         PROPERTYSQFT     ADDRESS             STATE          
##  Min.   : 0.000   Min.   :  275   Length:2400        Length:2400       
##  1st Qu.: 1.000   1st Qu.: 1213   Class :character   Class :character  
##  Median : 2.000   Median : 2184   Mode  :character   Mode  :character  
##  Mean   : 2.355   Mean   : 2115                                        
##  3rd Qu.: 3.000   3rd Qu.: 2184                                        
##  Max.   :50.000   Max.   :22035                                        
##  ADMINISTRATIVE_AREA_LEVEL_2   LOCALITY         SUBLOCALITY       
##  Length:2400                 Length:2400        Length:2400       
##  Class :character            Class :character   Class :character  
##  Mode  :character            Mode  :character   Mode  :character  
##                                                                   
##                                                                   
##                                                                   
##  STREET_NAME         LONG_NAME         FORMATTED_ADDRESS     LATITUDE    
##  Length:2400        Length:2400        Length:2400        Min.   :40.50  
##  Class :character   Class :character   Class :character   1st Qu.:40.64  
##  Mode  :character   Mode  :character   Mode  :character   Median :40.73  
##                                                           Mean   :40.71  
##                                                           3rd Qu.:40.77  
##                                                           Max.   :40.91  
##    LONGITUDE     
##  Min.   :-74.25  
##  1st Qu.:-73.99  
##  Median :-73.95  
##  Mean   :-73.94  
##  3rd Qu.:-73.88  
##  Max.   :-73.70
summary_df_2
##  BROKERTITLE            TYPE               PRICE                BEDS       
##  Length:2400        Length:2400        Min.   :2.494e+03   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.:4.990e+05   1st Qu.: 2.000  
##  Mode  :character   Mode  :character   Median :8.000e+05   Median : 3.000  
##                                        Mean   :2.780e+06   Mean   : 3.291  
##                                        3rd Qu.:1.449e+06   3rd Qu.: 4.000  
##                                        Max.   :2.147e+09   Max.   :32.000  
##       BATH         PROPERTYSQFT     ADDRESS             STATE          
##  Min.   : 0.000   Min.   :  250   Length:2400        Length:2400       
##  1st Qu.: 1.000   1st Qu.: 1186   Class :character   Class :character  
##  Median : 2.000   Median : 2184   Mode  :character   Mode  :character  
##  Mean   : 2.295   Mean   : 2164                                        
##  3rd Qu.: 3.000   3rd Qu.: 2184                                        
##  Max.   :20.000   Max.   :65535                                        
##  ADMINISTRATIVE_AREA_LEVEL_2   LOCALITY         SUBLOCALITY       
##  Length:2400                 Length:2400        Length:2400       
##  Class :character            Class :character   Class :character  
##  Mode  :character            Mode  :character   Mode  :character  
##                                                                   
##                                                                   
##                                                                   
##  STREET_NAME         LONG_NAME         FORMATTED_ADDRESS     LATITUDE    
##  Length:2400        Length:2400        Length:2400        Min.   :40.50  
##  Class :character   Class :character   Class :character   1st Qu.:40.64  
##  Mode  :character   Mode  :character   Mode  :character   Median :40.73  
##                                                           Mean   :40.71  
##                                                           3rd Qu.:40.77  
##                                                           Max.   :40.91  
##    LONGITUDE     
##  Min.   :-74.25  
##  1st Qu.:-73.99  
##  Median :-73.95  
##  Mean   :-73.94  
##  3rd Qu.:-73.87  
##  Max.   :-73.70
summary_df_3
##  BROKERTITLE            TYPE               PRICE                BEDS       
##  Length:2400        Length:2400        Min.   :     2494   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.:   499000   1st Qu.: 2.000  
##  Mode  :character   Mode  :character   Median :   800000   Median : 3.000  
##                                        Mean   :  2017308   Mean   : 3.344  
##                                        3rd Qu.:  1495000   3rd Qu.: 4.000  
##                                        Max.   :195000000   Max.   :32.000  
##       BATH         PROPERTYSQFT     ADDRESS             STATE          
##  Min.   : 0.000   Min.   :  260   Length:2400        Length:2400       
##  1st Qu.: 1.000   1st Qu.: 1230   Class :character   Class :character  
##  Median : 2.000   Median : 2184   Mode  :character   Mode  :character  
##  Mean   : 2.349   Mean   : 2279                                        
##  3rd Qu.: 3.000   3rd Qu.: 2184                                        
##  Max.   :32.000   Max.   :65535                                        
##  ADMINISTRATIVE_AREA_LEVEL_2   LOCALITY         SUBLOCALITY       
##  Length:2400                 Length:2400        Length:2400       
##  Class :character            Class :character   Class :character  
##  Mode  :character            Mode  :character   Mode  :character  
##                                                                   
##                                                                   
##                                                                   
##  STREET_NAME         LONG_NAME         FORMATTED_ADDRESS     LATITUDE    
##  Length:2400        Length:2400        Length:2400        Min.   :40.50  
##  Class :character   Class :character   Class :character   1st Qu.:40.64  
##  Mode  :character   Mode  :character   Mode  :character   Median :40.73  
##                                                           Mean   :40.71  
##                                                           3rd Qu.:40.77  
##                                                           Max.   :40.91  
##    LONGITUDE     
##  Min.   :-74.25  
##  1st Qu.:-73.99  
##  Median :-73.95  
##  Mean   :-73.94  
##  3rd Qu.:-73.86  
##  Max.   :-73.70
summary_df_4
##  BROKERTITLE            TYPE               PRICE                BEDS       
##  Length:2400        Length:2400        Min.   :2.494e+03   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.:4.838e+05   1st Qu.: 2.000  
##  Mode  :character   Mode  :character   Median :8.000e+05   Median : 3.000  
##                                        Mean   :3.785e+06   Mean   : 3.246  
##                                        3rd Qu.:1.488e+06   3rd Qu.: 4.000  
##                                        Max.   :2.147e+09   Max.   :40.000  
##       BATH         PROPERTYSQFT     ADDRESS             STATE          
##  Min.   : 1.000   Min.   :  246   Length:2400        Length:2400       
##  1st Qu.: 1.000   1st Qu.: 1214   Class :character   Class :character  
##  Median : 2.000   Median : 2184   Mode  :character   Mode  :character  
##  Mean   : 2.301   Mean   : 2152                                        
##  3rd Qu.: 3.000   3rd Qu.: 2184                                        
##  Max.   :20.000   Max.   :55300                                        
##  ADMINISTRATIVE_AREA_LEVEL_2   LOCALITY         SUBLOCALITY       
##  Length:2400                 Length:2400        Length:2400       
##  Class :character            Class :character   Class :character  
##  Mode  :character            Mode  :character   Mode  :character  
##                                                                   
##                                                                   
##                                                                   
##  STREET_NAME         LONG_NAME         FORMATTED_ADDRESS     LATITUDE    
##  Length:2400        Length:2400        Length:2400        Min.   :40.50  
##  Class :character   Class :character   Class :character   1st Qu.:40.64  
##  Mode  :character   Mode  :character   Mode  :character   Median :40.73  
##                                                           Mean   :40.71  
##                                                           3rd Qu.:40.77  
##                                                           Max.   :40.91  
##    LONGITUDE     
##  Min.   :-74.25  
##  1st Qu.:-73.99  
##  Median :-73.95  
##  Mean   :-73.94  
##  3rd Qu.:-73.87  
##  Max.   :-73.70
summary_df_5
##  BROKERTITLE            TYPE               PRICE                BEDS       
##  Length:2400        Length:2400        Min.   :4.950e+04   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.:4.750e+05   1st Qu.: 2.000  
##  Mode  :character   Mode  :character   Median :8.000e+05   Median : 3.000  
##                                        Mean   :2.900e+06   Mean   : 3.381  
##                                        3rd Qu.:1.465e+06   3rd Qu.: 4.000  
##                                        Max.   :2.147e+09   Max.   :50.000  
##       BATH         PROPERTYSQFT     ADDRESS             STATE          
##  Min.   : 0.000   Min.   :  230   Length:2400        Length:2400       
##  1st Qu.: 1.000   1st Qu.: 1192   Class :character   Class :character  
##  Median : 2.000   Median : 2184   Mode  :character   Mode  :character  
##  Mean   : 2.394   Mean   : 2238                                        
##  3rd Qu.: 3.000   3rd Qu.: 2184                                        
##  Max.   :50.000   Max.   :65535                                        
##  ADMINISTRATIVE_AREA_LEVEL_2   LOCALITY         SUBLOCALITY       
##  Length:2400                 Length:2400        Length:2400       
##  Class :character            Class :character   Class :character  
##  Mode  :character            Mode  :character   Mode  :character  
##                                                                   
##                                                                   
##                                                                   
##  STREET_NAME         LONG_NAME         FORMATTED_ADDRESS     LATITUDE    
##  Length:2400        Length:2400        Length:2400        Min.   :40.50  
##  Class :character   Class :character   Class :character   1st Qu.:40.64  
##  Mode  :character   Mode  :character   Mode  :character   Median :40.72  
##                                                           Mean   :40.71  
##                                                           3rd Qu.:40.77  
##                                                           Max.   :40.91  
##    LONGITUDE     
##  Min.   :-74.25  
##  1st Qu.:-73.99  
##  Median :-73.95  
##  Mean   :-73.94  
##  3rd Qu.:-73.87  
##  Max.   :-73.70
#Explanation to Reader:
#Creating Subsamples:
#Insight: Each subsample contains half as many rows as the full dataset, drawn at random with replacement, to simulate collecting a fresh sample from the population.
#Significance: Comparing the subsamples shows how much summary statistics can vary from one sample to the next.
#Task 2: Scrutinize Subsamples

# Mean listing price per property type, computed identically for every
# subsample. Returns one row per TYPE with a single avg_price column.
# NOTE(review): the summary() output printed earlier in this file shows a
# PRICE maximum of 2.147e+09 in some subsamples. That looks like a 32-bit
# integer ceiling rather than a real price and would dominate these means.
# TODO confirm whether such rows should be excluded.
mean_price_by_type <- function(subsample) {
  by_type <- group_by(subsample, TYPE)
  summarise(by_type, avg_price = mean(PRICE))
}

summary_1 <- mean_price_by_type(df_1)
summary_2 <- mean_price_by_type(df_2)
summary_3 <- mean_price_by_type(df_3)
summary_4 <- mean_price_by_type(df_4)
summary_5 <- mean_price_by_type(df_5)

# Display summaries
summary_1
## # A tibble: 13 × 2
##    TYPE                       avg_price
##    <chr>                          <dbl>
##  1 Co-op for sale              1009221.
##  2 Coming Soon                 1695000 
##  3 Condo for sale              2775153.
##  4 Condop for sale              839000 
##  5 Contingent                   743068.
##  6 For sale                    2493444.
##  7 Foreclosure                 2169908.
##  8 House for sale              1379290.
##  9 Land for sale               1234181.
## 10 Mobile house for sale       1288000 
## 11 Multi-family home for sale  1572505.
## 12 Pending                     1608402.
## 13 Townhouse for sale          7368926.
summary_2
## # A tibble: 11 × 2
##    TYPE                       avg_price
##    <chr>                          <dbl>
##  1 Co-op for sale              1010140.
##  2 Condo for sale              2285626.
##  3 Condop for sale              934333.
##  4 Contingent                   741809.
##  5 For sale                    3818949.
##  6 Foreclosure                 3391557.
##  7 House for sale              5953653.
##  8 Land for sale               1202071.
##  9 Multi-family home for sale  1845679.
## 10 Pending                     1043581.
## 11 Townhouse for sale          7045632.
summary_3
## # A tibble: 12 × 2
##    TYPE                       avg_price
##    <chr>                          <dbl>
##  1 Co-op for sale              1229866.
##  2 Coming Soon                  649000 
##  3 Condo for sale              3222034.
##  4 Condop for sale             1125000 
##  5 Contingent                   737901.
##  6 For sale                    2888749 
##  7 Foreclosure                 1886100 
##  8 House for sale              1615732.
##  9 Land for sale                761723.
## 10 Multi-family home for sale  1634963.
## 11 Pending                     1206786.
## 12 Townhouse for sale          6272132.
summary_4
## # A tibble: 12 × 2
##    TYPE                       avg_price
##    <chr>                          <dbl>
##  1 Co-op for sale              1143785.
##  2 Coming Soon                 1695000 
##  3 Condo for sale              2879929.
##  4 Condop for sale             1110000 
##  5 Contingent                   746550.
##  6 For sale                    1868143.
##  7 Foreclosure                 2007414.
##  8 House for sale              9749676.
##  9 Land for sale                687036 
## 10 Multi-family home for sale  1685311.
## 11 Pending                     1825273.
## 12 Townhouse for sale          6677086.
summary_5
## # A tibble: 11 × 2
##    TYPE                       avg_price
##    <chr>                          <dbl>
##  1 Co-op for sale              1009669.
##  2 Coming Soon                  910500 
##  3 Condo for sale              3444335.
##  4 Contingent                   809024.
##  5 For sale                    1965667.
##  6 Foreclosure                  555543.
##  7 House for sale              5576208.
##  8 Land for sale               1232289.
##  9 Multi-family home for sale  1768984.
## 10 Pending                     1501943.
## 11 Townhouse for sale          6495648.
#Explanation to Reader:
#Scrutinizing Subsamples:
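#Insight: Each subsample may have different characteristics. For example, the average "House for sale" price ranges from about 1.4 million to about 9.7 million across the five subsamples, and some property types are missing from some subsamples entirely.
#Significance: Identifying variations in averages for different property types helps us recognize the impact of sampling variability.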
#Task 3: Monte Carlo Simulation

# Monte Carlo Simulation for Average Price
# Repeatedly draws `sample_size` rows with replacement and records the mean
# PRICE of each draw, giving an empirical sampling distribution of the mean.
n_simulations <- 1000L

# Only PRICE feeds the statistic, so resample row indices and read that one
# column instead of copying every column of the data frame per iteration.
# vapply() guarantees a plain numeric vector of length n_simulations.
simulations <- vapply(
  seq_len(n_simulations),
  function(i) {
    rows <- sample.int(
      nrow(NY_House_Dataset),
      size = sample_size,
      replace = TRUE
    )
    mean(NY_House_Dataset$PRICE[rows])
  },
  numeric(1)
)

# Plot the distribution of simulated means
hist(simulations, main = "Monte Carlo Simulation of Average Price",
     xlab = "Simulated Mean Price", col = "lightblue", border = "black")

#Explanation to Reader:
#Monte Carlo Simulation:
#Insight: Simulating multiple samples helps visualize the distribution of sample means.
#Significance: It demonstrates the potential range of average prices we might observe due to sampling variability.
#Task 4: Considerations for Drawing Conclusions

# Load necessary libraries
library(ggplot2)
# Stack the five per-type summaries into one long table, tagging every row
# with the subsample it came from so the plot can group by it
compare_summaries <- bind_rows(
  mutate(summary_1, Sample = "1"),
  mutate(summary_2, Sample = "2"),
  mutate(summary_3, Sample = "3"),
  mutate(summary_4, Sample = "4"),
  mutate(summary_5, Sample = "5")
)

# Grouped bar chart of average price by subsample and property type.
# geom_col() plots the supplied y values directly, so stat = "identity" is
# not needed. preserve = "single" keeps every bar the same width even though
# the subsamples do not all contain the same set of property types.
ggplot(compare_summaries, aes(x = Sample, y = avg_price, fill = TYPE)) +
  geom_col(position = position_dodge(preserve = "single")) +
  labs(title = "Comparison of Average Prices in Subsamples",
       x = "Subsample", y = "Average Price", fill = "Property Type")

#Explanation to Reader:
#Considering Conclusions:
#Insight: Plotting the per-type average prices side by side makes the differences between subsamples easy to see.
#Significance: Patterns that stay consistent across subsamples support stable conclusions, while averages that swing widely between subsamples show where conclusions drawn from the data are unreliable.
#Task 5: Further Questions

# Further Questions
# Open questions raised by the sampling exercise, stored as a character
# vector with one question per element.
further_questions_sampling <- c(
  "How does the variability in subsamples impact our confidence in average price estimates?",
  "Are there specific property types that exhibit more variability in subsample averages?",
  "What is the trade-off between larger and smaller subsample sizes in terms of result stability?"
)

# Top-level expression: prints the vector
further_questions_sampling
## [1] "How does the variability in subsamples impact our confidence in average price estimates?"      
## [2] "Are there specific property types that exhibit more variability in subsample averages?"        
## [3] "What is the trade-off between larger and smaller subsample sizes in terms of result stability?"
#Explanation to Reader:
#Further Questions:
#Insight: Reflecting on implications and raising additional questions.
#Significance: Encourages critical thinking about the reliability of conclusions and potential avenues for future exploration.