# Load the data from the CSV file.
# The path defaults to the original download location; set the
# NY_HOUSE_DATASET_CSV environment variable to read the file from another
# location without editing this script.
NY_House_Dataset <- read.csv(
  Sys.getenv(
    "NY_HOUSE_DATASET_CSV",
    unset = "C:\\Users\\velag\\Downloads\\NY-House-Dataset.csv"
  )
)

# Display summary statistics for the entire data frame
summary(NY_House_Dataset)
##  BROKERTITLE            TYPE               PRICE                BEDS       
##  Length:4801        Length:4801        Min.   :2.494e+03   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.:4.990e+05   1st Qu.: 2.000  
##  Mode  :character   Mode  :character   Median :8.250e+05   Median : 3.000  
##                                        Mean   :2.357e+06   Mean   : 3.357  
##                                        3rd Qu.:1.495e+06   3rd Qu.: 4.000  
##                                        Max.   :2.147e+09   Max.   :50.000  
##       BATH         PROPERTYSQFT     ADDRESS             STATE          
##  Min.   : 0.000   Min.   :  230   Length:4801        Length:4801       
##  1st Qu.: 1.000   1st Qu.: 1200   Class :character   Class :character  
##  Median : 2.000   Median : 2184   Mode  :character   Mode  :character  
##  Mean   : 2.374   Mean   : 2184                                        
##  3rd Qu.: 3.000   3rd Qu.: 2184                                        
##  Max.   :50.000   Max.   :65535                                        
##  ADMINISTRATIVE_AREA_LEVEL_2   LOCALITY         SUBLOCALITY       
##  Length:4801                 Length:4801        Length:4801       
##  Class :character            Class :character   Class :character  
##  Mode  :character            Mode  :character   Mode  :character  
##                                                                   
##                                                                   
##                                                                   
##  STREET_NAME         LONG_NAME         FORMATTED_ADDRESS     LATITUDE    
##  Length:4801        Length:4801        Length:4801        Min.   :40.50  
##  Class :character   Class :character   Class :character   1st Qu.:40.64  
##  Mode  :character   Mode  :character   Mode  :character   Median :40.73  
##                                                           Mean   :40.71  
##                                                           3rd Qu.:40.77  
##                                                           Max.   :40.91  
##    LONGITUDE     
##  Min.   :-74.25  
##  1st Qu.:-73.99  
##  Median :-73.95  
##  Mean   :-73.94  
##  3rd Qu.:-73.87  
##  Max.   :-73.70
# Loading the necessary libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Group by property type and summarize price: one row per TYPE with the mean
# PRICE and the number of listings. summarise() drops the single grouping
# level, so the mutate() below sees the whole table and min() is the global
# minimum; every type tied for the fewest listings is tagged.
grouped_property_type <- NY_House_Dataset %>%
  group_by(TYPE) %>%
  summarise(avg_price = mean(PRICE), total_properties = n()) %>%
  mutate(
    special_tag = if_else(
      total_properties == min(total_properties),
      "Smallest Group",
      ""
    )
  )

# Translate the special tag back to the original data frame.
# The tag is joined under a grouping-specific name so it cannot collide with
# tag columns joined for other groupings (dplyr would otherwise disambiguate
# duplicate names with .x / .y suffixes).
NY_House_Dataset <- NY_House_Dataset %>%
  left_join(
    grouped_property_type %>% select(TYPE, special_tag_type = special_tag),
    by = "TYPE"
  )

# Display the resulting data frame
head(NY_House_Dataset)
##                                                                  BROKERTITLE
## 1                                Brokered by Douglas Elliman  -111 Fifth Ave
## 2                                                        Brokered by Serhant
## 3                                                     Brokered by Sowae Corp
## 4                                                        Brokered by COMPASS
## 5 Brokered by Sotheby's International Realty - East Side Manhattan Brokerage
## 6                                                     Brokered by Sowae Corp
##                 TYPE     PRICE BEDS      BATH PROPERTYSQFT
## 1     Condo for sale    315000    2  2.000000         1400
## 2     Condo for sale 195000000    7 10.000000        17545
## 3     House for sale    260000    4  2.000000         2015
## 4     Condo for sale     69000    3  1.000000          445
## 5 Townhouse for sale  55000000    7  2.373861        14175
## 6     House for sale    690000    5  2.000000         4004
##                                                              ADDRESS
## 1                                               2 E 55th St Unit 803
## 2 Central Park Tower Penthouse-217 W 57th New York St Unit Penthouse
## 3                                                   620 Sinclair Ave
## 4                                            2 E 55th St Unit 908W33
## 5                                                        5 E 64th St
## 6                                                        584 Park Pl
##                     STATE ADMINISTRATIVE_AREA_LEVEL_2 LOCALITY     SUBLOCALITY
## 1      New York, NY 10022             New York County New York       Manhattan
## 2      New York, NY 10019               United States New York New York County
## 3 Staten Island, NY 10312               United States New York Richmond County
## 4     Manhattan, NY 10022               United States New York New York County
## 5      New York, NY 10065               United States New York New York County
## 6      Brooklyn, NY 11238               United States New York    Kings County
##        STREET_NAME        LONG_NAME
## 1 East 55th Street  Regis Residence
## 2         New York West 57th Street
## 3    Staten Island  Sinclair Avenue
## 4         New York East 55th Street
## 5         New York East 64th Street
## 6         Brooklyn       Park Place
##                                            FORMATTED_ADDRESS LATITUDE LONGITUDE
## 1 Regis Residence, 2 E 55th St #803, New York, NY 10022, USA 40.76125 -73.97448
## 2                     217 W 57th St, New York, NY 10019, USA 40.76639 -73.98099
## 3             620 Sinclair Ave, Staten Island, NY 10312, USA 40.54181 -74.19611
## 4                       2 E 55th St, New York, NY 10022, USA 40.76140 -73.97461
## 5                       5 E 64th St, New York, NY 10065, USA 40.76722 -73.96986
## 6                       584 Park Pl, Brooklyn, NY 11238, USA 40.67436 -73.95872
##   special_tag
## 1            
## 2            
## 3            
## 4            
## 5            
## 6
# Grouping 2: Number of Bedrooms
# One row per BEDS value with the mean PRICE and the number of listings.
# summarise() drops the single grouping level, so min() in the mutate() below
# is the global minimum; every bedroom count tied for the fewest listings is
# tagged.
grouped_bedrooms <- NY_House_Dataset %>%
  group_by(BEDS) %>%
  summarise(avg_price = mean(PRICE), total_properties = n()) %>%
  mutate(
    special_tag = if_else(
      total_properties == min(total_properties),
      "Smallest Group",
      ""
    )
  )

# Join the tag back under a grouping-specific name so it cannot collide with
# tag columns joined for other groupings (dplyr would otherwise disambiguate
# duplicate names with .x / .y suffixes).
NY_House_Dataset <- NY_House_Dataset %>%
  left_join(
    grouped_bedrooms %>% select(BEDS, special_tag_beds = special_tag),
    by = "BEDS"
  )


# Explanation to Reader:
# The bar plot of `grouped_bedrooms` (drawn in the visualization section later
# in this script) shows the average price for each number of bedrooms.
# Insights: It provides insights into how the number of bedrooms influences property prices.

# Grouping 3: Neighborhood
# One row per LOCALITY with the mean PRICE and the number of listings.
# summarise() drops the single grouping level, so min() in the mutate() below
# is the global minimum; every locality tied for the fewest listings is
# tagged.
grouped_neighborhood <- NY_House_Dataset %>%
  group_by(LOCALITY) %>%
  summarise(avg_price = mean(PRICE), total_properties = n()) %>%
  mutate(
    special_tag = if_else(
      total_properties == min(total_properties),
      "Smallest Group",
      ""
    )
  )

# Join the tag back under a grouping-specific name so it cannot collide with
# tag columns joined for other groupings (dplyr would otherwise disambiguate
# duplicate names with .x / .y suffixes).
NY_House_Dataset <- NY_House_Dataset %>%
  left_join(
    grouped_neighborhood %>%
      select(LOCALITY, special_tag_locality = special_tag),
    by = "LOCALITY"
  )


# Explanation to Reader:
# The bar plot of `grouped_neighborhood` (drawn in the visualization section
# later in this script) shows the average price for each neighborhood.
# Insights: It highlights variations in property prices across neighborhoods, aiding in understanding localized market dynamics.

# Display the first few rows of the updated data frame
head(NY_House_Dataset)
##                                                                  BROKERTITLE
## 1                                Brokered by Douglas Elliman  -111 Fifth Ave
## 2                                                        Brokered by Serhant
## 3                                                     Brokered by Sowae Corp
## 4                                                        Brokered by COMPASS
## 5 Brokered by Sotheby's International Realty - East Side Manhattan Brokerage
## 6                                                     Brokered by Sowae Corp
##                 TYPE     PRICE BEDS      BATH PROPERTYSQFT
## 1     Condo for sale    315000    2  2.000000         1400
## 2     Condo for sale 195000000    7 10.000000        17545
## 3     House for sale    260000    4  2.000000         2015
## 4     Condo for sale     69000    3  1.000000          445
## 5 Townhouse for sale  55000000    7  2.373861        14175
## 6     House for sale    690000    5  2.000000         4004
##                                                              ADDRESS
## 1                                               2 E 55th St Unit 803
## 2 Central Park Tower Penthouse-217 W 57th New York St Unit Penthouse
## 3                                                   620 Sinclair Ave
## 4                                            2 E 55th St Unit 908W33
## 5                                                        5 E 64th St
## 6                                                        584 Park Pl
##                     STATE ADMINISTRATIVE_AREA_LEVEL_2 LOCALITY     SUBLOCALITY
## 1      New York, NY 10022             New York County New York       Manhattan
## 2      New York, NY 10019               United States New York New York County
## 3 Staten Island, NY 10312               United States New York Richmond County
## 4     Manhattan, NY 10022               United States New York New York County
## 5      New York, NY 10065               United States New York New York County
## 6      Brooklyn, NY 11238               United States New York    Kings County
##        STREET_NAME        LONG_NAME
## 1 East 55th Street  Regis Residence
## 2         New York West 57th Street
## 3    Staten Island  Sinclair Avenue
## 4         New York East 55th Street
## 5         New York East 64th Street
## 6         Brooklyn       Park Place
##                                            FORMATTED_ADDRESS LATITUDE LONGITUDE
## 1 Regis Residence, 2 E 55th St #803, New York, NY 10022, USA 40.76125 -73.97448
## 2                     217 W 57th St, New York, NY 10019, USA 40.76639 -73.98099
## 3             620 Sinclair Ave, Staten Island, NY 10312, USA 40.54181 -74.19611
## 4                       2 E 55th St, New York, NY 10022, USA 40.76140 -73.97461
## 5                       5 E 64th St, New York, NY 10065, USA 40.76722 -73.96986
## 6                       584 Park Pl, Brooklyn, NY 11238, USA 40.67436 -73.95872
##   special_tag.x special_tag.y special_tag
## 1                                        
## 2                                        
## 3                                        
## 4                                        
## 5                                        
## 6
# Loading the necessary libraries
library(dplyr)
library(ggplot2)


# Bar chart: mean listing price per property type, with bars coloured by the
# "Smallest Group" tag computed for `grouped_property_type`.
ggplot(
  data = grouped_property_type,
  mapping = aes(x = TYPE, y = avg_price, fill = special_tag)
) +
  geom_col() +
  labs(
    x = "Property Type",
    y = "Average Price",
    title = "Average Price by Property Type"
  )

# Bar chart: mean listing price per bedroom count, with bars coloured by the
# "Smallest Group" tag computed for `grouped_bedrooms`.
ggplot(
  data = grouped_bedrooms,
  mapping = aes(x = BEDS, y = avg_price, fill = special_tag)
) +
  geom_col() +
  labs(
    x = "Number of Bedrooms",
    y = "Average Price",
    title = "Average Price by Number of Bedrooms"
  )

# Bar chart: mean listing price per LOCALITY, with bars coloured by the
# "Smallest Group" tag computed for `grouped_neighborhood`. The x-axis labels
# are rotated 45 degrees and right-aligned.
ggplot(
  data = grouped_neighborhood,
  mapping = aes(x = LOCALITY, y = avg_price, fill = special_tag)
) +
  geom_col() +
  labs(
    x = "Neighborhood",
    y = "Average Price",
    title = "Average Price by Neighborhood"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Check for missing combinations: build every possible (TYPE, BATH) pair from
# the values present in the data, then keep only the pairs that never occur
# together in any listing.
# stringsAsFactors = FALSE keeps TYPE as character (expand.grid() would
# otherwise turn it into a factor), so the join keys have the same type on
# both sides and the result matches the dataset's column types.
missing_combinations <- expand.grid(
  TYPE = unique(NY_House_Dataset$TYPE),
  BATH = unique(NY_House_Dataset$BATH),
  stringsAsFactors = FALSE
) %>%
  anti_join(
    NY_House_Dataset %>% select(TYPE, BATH),
    by = c("TYPE", "BATH")
  )

# Display missing combinations
missing_combinations
##                           TYPE      BATH
## 1        Mobile house for sale  2.000000
## 2               Co-op for sale 10.000000
## 3                     For sale 10.000000
## 4                   Contingent 10.000000
## 5                Land for sale 10.000000
## 6                  Foreclosure 10.000000
## 7                  Coming Soon 10.000000
## 8        Mobile house for sale 10.000000
## 9              Condop for sale 10.000000
## 10               Land for sale  1.000000
## 11                 Foreclosure  1.000000
## 12                 Coming Soon  1.000000
## 13       Mobile house for sale  1.000000
## 14                 Coming Soon  2.373861
## 15       Mobile house for sale  2.373861
## 16             Condop for sale  2.373861
## 17              Condo for sale 16.000000
## 18          Townhouse for sale 16.000000
## 19              Co-op for sale 16.000000
## 20                    For sale 16.000000
## 21                  Contingent 16.000000
## 22               Land for sale 16.000000
## 23                 Foreclosure 16.000000
## 24                     Pending 16.000000
## 25                 Coming Soon 16.000000
## 26       Mobile house for sale 16.000000
## 27             Condop for sale 16.000000
## 28               Land for sale  3.000000
## 29             Condop for sale  3.000000
## 30               Land for sale  4.000000
## 31                 Coming Soon  4.000000
## 32       Mobile house for sale  4.000000
## 33             Condop for sale  4.000000
## 34               Land for sale  6.000000
## 35                 Coming Soon  6.000000
## 36       Mobile house for sale  6.000000
## 37             Condop for sale  6.000000
## 38                    For sale  8.000000
## 39                  Contingent  8.000000
## 40               Land for sale  8.000000
## 41                 Foreclosure  8.000000
## 42                     Pending  8.000000
## 43                 Coming Soon  8.000000
## 44       Mobile house for sale  8.000000
## 45             Condop for sale  8.000000
## 46              Co-op for sale  5.000000
## 47                    For sale  5.000000
## 48               Land for sale  5.000000
## 49                 Foreclosure  5.000000
## 50                 Coming Soon  5.000000
## 51       Mobile house for sale  5.000000
## 52             Condop for sale  5.000000
## 53              Condo for sale  9.000000
## 54              House for sale  9.000000
## 55              Co-op for sale  9.000000
## 56                    For sale  9.000000
## 57               Land for sale  9.000000
## 58                 Foreclosure  9.000000
## 59                     Pending  9.000000
## 60                 Coming Soon  9.000000
## 61       Mobile house for sale  9.000000
## 62             Condop for sale  9.000000
## 63              Co-op for sale  7.000000
## 64                    For sale  7.000000
## 65                  Contingent  7.000000
## 66               Land for sale  7.000000
## 67                 Foreclosure  7.000000
## 68                 Coming Soon  7.000000
## 69       Mobile house for sale  7.000000
## 70             Condop for sale  7.000000
## 71              Condo for sale 32.000000
## 72              House for sale 32.000000
## 73          Townhouse for sale 32.000000
## 74              Co-op for sale 32.000000
## 75                    For sale 32.000000
## 76                  Contingent 32.000000
## 77               Land for sale 32.000000
## 78                 Foreclosure 32.000000
## 79                     Pending 32.000000
## 80                 Coming Soon 32.000000
## 81       Mobile house for sale 32.000000
## 82             Condop for sale 32.000000
## 83              Condo for sale 13.000000
## 84              House for sale 13.000000
## 85              Co-op for sale 13.000000
## 86  Multi-family home for sale 13.000000
## 87                    For sale 13.000000
## 88                  Contingent 13.000000
## 89               Land for sale 13.000000
## 90                 Foreclosure 13.000000
## 91                     Pending 13.000000
## 92                 Coming Soon 13.000000
## 93       Mobile house for sale 13.000000
## 94             Condop for sale 13.000000
## 95              Condo for sale 50.000000
## 96              House for sale 50.000000
## 97          Townhouse for sale 50.000000
## 98              Co-op for sale 50.000000
## 99                    For sale 50.000000
## 100                 Contingent 50.000000
## 101              Land for sale 50.000000
## 102                Foreclosure 50.000000
## 103                    Pending 50.000000
## 104                Coming Soon 50.000000
## 105      Mobile house for sale 50.000000
## 106            Condop for sale 50.000000
## 107             Condo for sale 20.000000
## 108             House for sale 20.000000
## 109         Townhouse for sale 20.000000
## 110             Co-op for sale 20.000000
## 111                   For sale 20.000000
## 112                 Contingent 20.000000
## 113              Land for sale 20.000000
## 114                Foreclosure 20.000000
## 115                    Pending 20.000000
## 116                Coming Soon 20.000000
## 117      Mobile house for sale 20.000000
## 118            Condop for sale 20.000000
## 119             Condo for sale 11.000000
## 120             House for sale 11.000000
## 121             Co-op for sale 11.000000
## 122                   For sale 11.000000
## 123                 Contingent 11.000000
## 124              Land for sale 11.000000
## 125                Foreclosure 11.000000
## 126                    Pending 11.000000
## 127                Coming Soon 11.000000
## 128      Mobile house for sale 11.000000
## 129            Condop for sale 11.000000
## 130             Condo for sale 12.000000
## 131         Townhouse for sale 12.000000
## 132             Co-op for sale 12.000000
## 133                   For sale 12.000000
## 134                 Contingent 12.000000
## 135              Land for sale 12.000000
## 136                Foreclosure 12.000000
## 137                    Pending 12.000000
## 138                Coming Soon 12.000000
## 139      Mobile house for sale 12.000000
## 140            Condop for sale 12.000000
## 141             Condo for sale 24.000000
## 142             House for sale 24.000000
## 143         Townhouse for sale 24.000000
## 144             Co-op for sale 24.000000
## 145                   For sale 24.000000
## 146                 Contingent 24.000000
## 147              Land for sale 24.000000
## 148                Foreclosure 24.000000
## 149                    Pending 24.000000
## 150                Coming Soon 24.000000
## 151      Mobile house for sale 24.000000
## 152            Condop for sale 24.000000
## 153             Condo for sale 43.000000
## 154             House for sale 43.000000
## 155         Townhouse for sale 43.000000
## 156             Co-op for sale 43.000000
## 157                   For sale 43.000000
## 158                 Contingent 43.000000
## 159              Land for sale 43.000000
## 160                Foreclosure 43.000000
## 161                    Pending 43.000000
## 162                Coming Soon 43.000000
## 163      Mobile house for sale 43.000000
## 164            Condop for sale 43.000000
## 165             Condo for sale  0.000000
## 166             House for sale  0.000000
## 167             Co-op for sale  0.000000
## 168 Multi-family home for sale  0.000000
## 169                   For sale  0.000000
## 170                 Contingent  0.000000
## 171              Land for sale  0.000000
## 172                Foreclosure  0.000000
## 173                Coming Soon  0.000000
## 174      Mobile house for sale  0.000000
## 175            Condop for sale  0.000000
## 176             Condo for sale 17.000000
## 177             House for sale 17.000000
## 178         Townhouse for sale 17.000000
## 179             Co-op for sale 17.000000
## 180                   For sale 17.000000
## 181                 Contingent 17.000000
## 182              Land for sale 17.000000
## 183                Foreclosure 17.000000
## 184                    Pending 17.000000
## 185                Coming Soon 17.000000
## 186      Mobile house for sale 17.000000
## 187            Condop for sale 17.000000
# Count occurrences of each (TYPE, BATH) combination.
# `.groups = "drop"` returns an ungrouped tibble. Without it the result stays
# grouped by TYPE, and the max()/min() filters below would be evaluated
# separately within each property type instead of across the whole dataset.
combination_counts <- NY_House_Dataset %>%
  group_by(TYPE, BATH) %>%
  summarise(count = n(), .groups = "drop")

# Most and least common combinations across the whole dataset.
# Ties are kept, so either result may contain more than one row.
most_common_combination <- combination_counts %>% filter(count == max(count))
least_common_combination <- combination_counts %>% filter(count == min(count))

most_common_combination
## # A tibble: 14 × 3
## # Groups:   TYPE [13]
##    TYPE                        BATH count
##    <chr>                      <dbl> <int>
##  1 Co-op for sale              1     1012
##  2 Coming Soon                 2        1
##  3 Coming Soon                 3        1
##  4 Condo for sale              2      410
##  5 Condop for sale             1        4
##  6 Contingent                  2       34
##  7 For sale                    1        8
##  8 Foreclosure                 2        7
##  9 House for sale              2      531
## 10 Land for sale               2.37    48
## 11 Mobile house for sale       3        1
## 12 Multi-family home for sale  3      192
## 13 Pending                     2       99
## 14 Townhouse for sale          2       75
least_common_combination
## # A tibble: 27 × 3
## # Groups:   TYPE [13]
##    TYPE             BATH count
##    <chr>           <dbl> <int>
##  1 Co-op for sale   8        3
##  2 Coming Soon      2        1
##  3 Coming Soon      3        1
##  4 Condo for sale   7        1
##  5 Condo for sale   8        1
##  6 Condo for sale  10        1
##  7 Condop for sale  2        1
##  8 Contingent       9        1
##  9 For sale         2.37     1
## 10 For sale         6        1
## # ℹ 17 more rows
# Bar chart: number of listings for every observed TYPE x BATH pair. Each pair
# is collapsed into a single factor via interaction(), which is used for both
# the x position and the fill colour; x-axis labels are rotated 45 degrees.
ggplot(
  data = combination_counts,
  mapping = aes(
    x = interaction(TYPE, BATH),
    y = count,
    fill = interaction(TYPE, BATH)
  )
) +
  geom_col() +
  labs(
    x = "Combination",
    y = "Count",
    title = "Count of Property Type and Bathroom Combinations"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Numeric Summary Explanation
# Print min / quartiles / median / mean / max for the 'PRICE' and
# 'PROPERTYSQFT' columns only.
NY_House_Dataset %>%
  select(PRICE, PROPERTYSQFT) %>%
  summary()
##      PRICE            PROPERTYSQFT  
##  Min.   :2.494e+03   Min.   :  230  
##  1st Qu.:4.990e+05   1st Qu.: 1200  
##  Median :8.250e+05   Median : 2184  
##  Mean   :2.357e+06   Mean   : 2184  
##  3rd Qu.:1.495e+06   3rd Qu.: 2184  
##  Max.   :2.147e+09   Max.   :65535
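# Explanation to Reader:
# The numeric summary provides an overview of the distribution of property prices and square footage.
# Insights: The mean and median describe central tendency, while the quartiles and min/max describe spread. The mean price (about 2.36 million) sits far above the median (825,000), so prices are strongly right-skewed by a small number of very expensive listings.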



# Visualization for average price by property type
# Explanation to Reader:
# The bar plot of average price by property type (drawn in the visualization section earlier in this script) compares the mean listing price of each property type.
# Insights: It highlights variations in property prices across types, identifying potential trends or disparities in the market.

# Combination Analysis Explanation
# Check for missing combinations of property type and bathrooms
# Explanation to Reader:
# Identifying missing combinations helps understand data gaps.
# Insights: Some combinations may be absent, indicating limited availability or potential data collection issues.

# Further Questions:
# Three open questions raised by the analysis, stored as an unnamed character
# vector (filled slot by slot) and then displayed.
further_questions <- character(3L)
further_questions[1L] <- "What factors contribute to the scarcity of certain property types or bedroom configurations in the market?"
further_questions[2L] <- "Are there specific characteristics of neighborhoods that explain their varying property prices?"
further_questions[3L] <- "How do missing combinations impact the overall understanding of the real estate market, and what strategies can be employed to address data gaps?"

further_questions
## [1] "What factors contribute to the scarcity of certain property types or bedroom configurations in the market?"                                       
## [2] "Are there specific characteristics of neighborhoods that explain their varying property prices?"                                                  
## [3] "How do missing combinations impact the overall understanding of the real estate market, and what strategies can be employed to address data gaps?"
# Explanation to Reader:
# These questions guide further exploration and analysis.
# Significance: Addressing these questions could provide deeper insights into market dynamics and improve data completeness.