# Read the New York housing listings CSV into a data frame.
# NOTE: the path is absolute and machine-specific, so the file must exist at
# this exact location on the machine running the script.
NY_House_Dataset <- read.csv(
  file = "C:\\Users\\velag\\Downloads\\NY-House-Dataset.csv"
)
# Print per-column summary statistics for the whole data frame
summary(object = NY_House_Dataset)
## BROKERTITLE TYPE PRICE BEDS
## Length:4801 Length:4801 Min. :2.494e+03 Min. : 1.000
## Class :character Class :character 1st Qu.:4.990e+05 1st Qu.: 2.000
## Mode :character Mode :character Median :8.250e+05 Median : 3.000
## Mean :2.357e+06 Mean : 3.357
## 3rd Qu.:1.495e+06 3rd Qu.: 4.000
## Max. :2.147e+09 Max. :50.000
## BATH PROPERTYSQFT ADDRESS STATE
## Min. : 0.000 Min. : 230 Length:4801 Length:4801
## 1st Qu.: 1.000 1st Qu.: 1200 Class :character Class :character
## Median : 2.000 Median : 2184 Mode :character Mode :character
## Mean : 2.374 Mean : 2184
## 3rd Qu.: 3.000 3rd Qu.: 2184
## Max. :50.000 Max. :65535
## ADMINISTRATIVE_AREA_LEVEL_2 LOCALITY SUBLOCALITY
## Length:4801 Length:4801 Length:4801
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## STREET_NAME LONG_NAME FORMATTED_ADDRESS LATITUDE
## Length:4801 Length:4801 Length:4801 Min. :40.50
## Class :character Class :character Class :character 1st Qu.:40.64
## Mode :character Mode :character Mode :character Median :40.73
## Mean :40.71
## 3rd Qu.:40.77
## Max. :40.91
## LONGITUDE
## Min. :-74.25
## 1st Qu.:-73.99
## Median :-73.95
## Mean :-73.94
## 3rd Qu.:-73.87
## Max. :-73.70
# Loading the necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Grouping 1: Property type
# Summarise the mean price and the number of listings for each property type.
grouped_property_type <- NY_House_Dataset %>%
  group_by(TYPE) %>%
  summarise(avg_price = mean(PRICE), total_properties = n())
# Flag the type(s) with the fewest listings; ties are all tagged.
# The column stays named `special_tag` on the summary table because the
# plotting code reads it under that name.
grouped_property_type$special_tag <- ifelse(
  grouped_property_type$total_properties ==
    min(grouped_property_type$total_properties),
  "Smallest Group",
  ""
)
# Copy the tag back onto every listing under a type-specific column name.
# Joining it as a generic `special_tag` collides with the identically named
# columns joined later for BEDS and LOCALITY, which dplyr then disambiguates
# as `special_tag.x` / `special_tag.y`.
# NOTE: console output pasted below this chunk was captured before the rename
# and still shows the old `special_tag` header; re-run to refresh it.
NY_House_Dataset <- left_join(
  NY_House_Dataset,
  grouped_property_type %>% select(TYPE, type_special_tag = special_tag),
  by = "TYPE"
)
# Display the first rows of the resulting data frame
head(NY_House_Dataset)
## BROKERTITLE
## 1 Brokered by Douglas Elliman -111 Fifth Ave
## 2 Brokered by Serhant
## 3 Brokered by Sowae Corp
## 4 Brokered by COMPASS
## 5 Brokered by Sotheby's International Realty - East Side Manhattan Brokerage
## 6 Brokered by Sowae Corp
## TYPE PRICE BEDS BATH PROPERTYSQFT
## 1 Condo for sale 315000 2 2.000000 1400
## 2 Condo for sale 195000000 7 10.000000 17545
## 3 House for sale 260000 4 2.000000 2015
## 4 Condo for sale 69000 3 1.000000 445
## 5 Townhouse for sale 55000000 7 2.373861 14175
## 6 House for sale 690000 5 2.000000 4004
## ADDRESS
## 1 2 E 55th St Unit 803
## 2 Central Park Tower Penthouse-217 W 57th New York St Unit Penthouse
## 3 620 Sinclair Ave
## 4 2 E 55th St Unit 908W33
## 5 5 E 64th St
## 6 584 Park Pl
## STATE ADMINISTRATIVE_AREA_LEVEL_2 LOCALITY SUBLOCALITY
## 1 New York, NY 10022 New York County New York Manhattan
## 2 New York, NY 10019 United States New York New York County
## 3 Staten Island, NY 10312 United States New York Richmond County
## 4 Manhattan, NY 10022 United States New York New York County
## 5 New York, NY 10065 United States New York New York County
## 6 Brooklyn, NY 11238 United States New York Kings County
## STREET_NAME LONG_NAME
## 1 East 55th Street Regis Residence
## 2 New York West 57th Street
## 3 Staten Island Sinclair Avenue
## 4 New York East 55th Street
## 5 New York East 64th Street
## 6 Brooklyn Park Place
## FORMATTED_ADDRESS LATITUDE LONGITUDE
## 1 Regis Residence, 2 E 55th St #803, New York, NY 10022, USA 40.76125 -73.97448
## 2 217 W 57th St, New York, NY 10019, USA 40.76639 -73.98099
## 3 620 Sinclair Ave, Staten Island, NY 10312, USA 40.54181 -74.19611
## 4 2 E 55th St, New York, NY 10022, USA 40.76140 -73.97461
## 5 5 E 64th St, New York, NY 10065, USA 40.76722 -73.96986
## 6 584 Park Pl, Brooklyn, NY 11238, USA 40.67436 -73.95872
## special_tag
## 1
## 2
## 3
## 4
## 5
## 6
# Grouping 2: Number of Bedrooms
# Summarise the mean price and the number of listings for each bedroom count.
grouped_bedrooms <- NY_House_Dataset %>%
  group_by(BEDS) %>%
  summarise(avg_price = mean(PRICE), total_properties = n())
# Flag the bedroom count(s) with the fewest listings; ties are all tagged.
# The column stays named `special_tag` on the summary table because the
# plotting code reads it under that name.
grouped_bedrooms$special_tag <- ifelse(
  grouped_bedrooms$total_properties == min(grouped_bedrooms$total_properties),
  "Smallest Group",
  ""
)
# Join the tag onto the listings under a BEDS-specific name. Reusing the
# generic `special_tag` name would clash with the tag columns joined for the
# other groupings and produce `special_tag.x` / `special_tag.y`.
NY_House_Dataset <- left_join(
  NY_House_Dataset,
  grouped_bedrooms %>% select(BEDS, beds_special_tag = special_tag),
  by = "BEDS"
)
# Explanation to Reader:
# The bar plot of this summary (drawn further below) visualizes the average
# price for different numbers of bedrooms.
# Insights: It provides insights into how the number of bedrooms influences property prices.
# Grouping 3: Neighborhood
# Summarise the mean price and the number of listings for each LOCALITY.
grouped_neighborhood <- NY_House_Dataset %>%
  group_by(LOCALITY) %>%
  summarise(avg_price = mean(PRICE), total_properties = n())
# Flag the locality/localities with the fewest listings; ties are all tagged.
# The column stays named `special_tag` on the summary table because the
# plotting code reads it under that name.
grouped_neighborhood$special_tag <- ifelse(
  grouped_neighborhood$total_properties ==
    min(grouped_neighborhood$total_properties),
  "Smallest Group",
  ""
)
# Join the tag onto the listings under a LOCALITY-specific name. Reusing the
# generic `special_tag` name would clash with the tag columns joined for the
# other groupings and produce `special_tag.x` / `special_tag.y`.
# NOTE: console output pasted below this chunk was captured before the rename
# and still shows the old suffixed headers; re-run to refresh it.
NY_House_Dataset <- left_join(
  NY_House_Dataset,
  grouped_neighborhood %>%
    select(LOCALITY, locality_special_tag = special_tag),
  by = "LOCALITY"
)
# Explanation to Reader:
# The bar plot of this summary (drawn further below) visualizes the average
# price for different neighborhoods.
# Insights: It highlights variations in property prices across neighborhoods, aiding in understanding localized market dynamics.
# Display the first few rows of the updated data frame
head(NY_House_Dataset)
## BROKERTITLE
## 1 Brokered by Douglas Elliman -111 Fifth Ave
## 2 Brokered by Serhant
## 3 Brokered by Sowae Corp
## 4 Brokered by COMPASS
## 5 Brokered by Sotheby's International Realty - East Side Manhattan Brokerage
## 6 Brokered by Sowae Corp
## TYPE PRICE BEDS BATH PROPERTYSQFT
## 1 Condo for sale 315000 2 2.000000 1400
## 2 Condo for sale 195000000 7 10.000000 17545
## 3 House for sale 260000 4 2.000000 2015
## 4 Condo for sale 69000 3 1.000000 445
## 5 Townhouse for sale 55000000 7 2.373861 14175
## 6 House for sale 690000 5 2.000000 4004
## ADDRESS
## 1 2 E 55th St Unit 803
## 2 Central Park Tower Penthouse-217 W 57th New York St Unit Penthouse
## 3 620 Sinclair Ave
## 4 2 E 55th St Unit 908W33
## 5 5 E 64th St
## 6 584 Park Pl
## STATE ADMINISTRATIVE_AREA_LEVEL_2 LOCALITY SUBLOCALITY
## 1 New York, NY 10022 New York County New York Manhattan
## 2 New York, NY 10019 United States New York New York County
## 3 Staten Island, NY 10312 United States New York Richmond County
## 4 Manhattan, NY 10022 United States New York New York County
## 5 New York, NY 10065 United States New York New York County
## 6 Brooklyn, NY 11238 United States New York Kings County
## STREET_NAME LONG_NAME
## 1 East 55th Street Regis Residence
## 2 New York West 57th Street
## 3 Staten Island Sinclair Avenue
## 4 New York East 55th Street
## 5 New York East 64th Street
## 6 Brooklyn Park Place
## FORMATTED_ADDRESS LATITUDE LONGITUDE
## 1 Regis Residence, 2 E 55th St #803, New York, NY 10022, USA 40.76125 -73.97448
## 2 217 W 57th St, New York, NY 10019, USA 40.76639 -73.98099
## 3 620 Sinclair Ave, Staten Island, NY 10312, USA 40.54181 -74.19611
## 4 2 E 55th St, New York, NY 10022, USA 40.76140 -73.97461
## 5 5 E 64th St, New York, NY 10065, USA 40.76722 -73.96986
## 6 584 Park Pl, Brooklyn, NY 11238, USA 40.67436 -73.95872
## special_tag.x special_tag.y special_tag
## 1
## 2
## 3
## 4
## 5
## 6
# Loading the necessary libraries
library(dplyr)
library(ggplot2)
# Bar chart of mean price per property type; bars are coloured by the
# "Smallest Group" tag so the least-represented type stands out.
ggplot(
  data = grouped_property_type,
  mapping = aes(x = TYPE, y = avg_price, fill = special_tag)
) +
  geom_col() +
  labs(
    title = "Average Price by Property Type",
    x = "Property Type",
    y = "Average Price"
  )

# Bar chart of mean price per bedroom count; bars are coloured by the
# "Smallest Group" tag.
ggplot(
  data = grouped_bedrooms,
  mapping = aes(x = BEDS, y = avg_price, fill = special_tag)
) +
  geom_col() +
  labs(
    title = "Average Price by Number of Bedrooms",
    x = "Number of Bedrooms",
    y = "Average Price"
  )

# Bar chart of mean price per LOCALITY; bars are coloured by the
# "Smallest Group" tag. Axis labels are rotated 45 degrees.
ggplot(
  data = grouped_neighborhood,
  mapping = aes(x = LOCALITY, y = avg_price, fill = special_tag)
) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Average Price by Neighborhood",
    x = "Neighborhood",
    y = "Average Price"
  )

# Check for missing combinations
# Build every possible TYPE x BATH pairing from the values observed in the
# data, then keep only the pairings that never occur in any listing.
# stringsAsFactors = FALSE keeps TYPE as character: expand.grid() converts
# character vectors to factors by default, which would make the join compare
# a factor key against the character TYPE column of the data.
missing_combinations <- expand.grid(
  TYPE = unique(NY_House_Dataset$TYPE),
  BATH = unique(NY_House_Dataset$BATH),
  stringsAsFactors = FALSE
) %>%
  anti_join(
    NY_House_Dataset %>% select(TYPE, BATH),
    by = c("TYPE", "BATH")
  )
# Display missing combinations
missing_combinations
## TYPE BATH
## 1 Mobile house for sale 2.000000
## 2 Co-op for sale 10.000000
## 3 For sale 10.000000
## 4 Contingent 10.000000
## 5 Land for sale 10.000000
## 6 Foreclosure 10.000000
## 7 Coming Soon 10.000000
## 8 Mobile house for sale 10.000000
## 9 Condop for sale 10.000000
## 10 Land for sale 1.000000
## 11 Foreclosure 1.000000
## 12 Coming Soon 1.000000
## 13 Mobile house for sale 1.000000
## 14 Coming Soon 2.373861
## 15 Mobile house for sale 2.373861
## 16 Condop for sale 2.373861
## 17 Condo for sale 16.000000
## 18 Townhouse for sale 16.000000
## 19 Co-op for sale 16.000000
## 20 For sale 16.000000
## 21 Contingent 16.000000
## 22 Land for sale 16.000000
## 23 Foreclosure 16.000000
## 24 Pending 16.000000
## 25 Coming Soon 16.000000
## 26 Mobile house for sale 16.000000
## 27 Condop for sale 16.000000
## 28 Land for sale 3.000000
## 29 Condop for sale 3.000000
## 30 Land for sale 4.000000
## 31 Coming Soon 4.000000
## 32 Mobile house for sale 4.000000
## 33 Condop for sale 4.000000
## 34 Land for sale 6.000000
## 35 Coming Soon 6.000000
## 36 Mobile house for sale 6.000000
## 37 Condop for sale 6.000000
## 38 For sale 8.000000
## 39 Contingent 8.000000
## 40 Land for sale 8.000000
## 41 Foreclosure 8.000000
## 42 Pending 8.000000
## 43 Coming Soon 8.000000
## 44 Mobile house for sale 8.000000
## 45 Condop for sale 8.000000
## 46 Co-op for sale 5.000000
## 47 For sale 5.000000
## 48 Land for sale 5.000000
## 49 Foreclosure 5.000000
## 50 Coming Soon 5.000000
## 51 Mobile house for sale 5.000000
## 52 Condop for sale 5.000000
## 53 Condo for sale 9.000000
## 54 House for sale 9.000000
## 55 Co-op for sale 9.000000
## 56 For sale 9.000000
## 57 Land for sale 9.000000
## 58 Foreclosure 9.000000
## 59 Pending 9.000000
## 60 Coming Soon 9.000000
## 61 Mobile house for sale 9.000000
## 62 Condop for sale 9.000000
## 63 Co-op for sale 7.000000
## 64 For sale 7.000000
## 65 Contingent 7.000000
## 66 Land for sale 7.000000
## 67 Foreclosure 7.000000
## 68 Coming Soon 7.000000
## 69 Mobile house for sale 7.000000
## 70 Condop for sale 7.000000
## 71 Condo for sale 32.000000
## 72 House for sale 32.000000
## 73 Townhouse for sale 32.000000
## 74 Co-op for sale 32.000000
## 75 For sale 32.000000
## 76 Contingent 32.000000
## 77 Land for sale 32.000000
## 78 Foreclosure 32.000000
## 79 Pending 32.000000
## 80 Coming Soon 32.000000
## 81 Mobile house for sale 32.000000
## 82 Condop for sale 32.000000
## 83 Condo for sale 13.000000
## 84 House for sale 13.000000
## 85 Co-op for sale 13.000000
## 86 Multi-family home for sale 13.000000
## 87 For sale 13.000000
## 88 Contingent 13.000000
## 89 Land for sale 13.000000
## 90 Foreclosure 13.000000
## 91 Pending 13.000000
## 92 Coming Soon 13.000000
## 93 Mobile house for sale 13.000000
## 94 Condop for sale 13.000000
## 95 Condo for sale 50.000000
## 96 House for sale 50.000000
## 97 Townhouse for sale 50.000000
## 98 Co-op for sale 50.000000
## 99 For sale 50.000000
## 100 Contingent 50.000000
## 101 Land for sale 50.000000
## 102 Foreclosure 50.000000
## 103 Pending 50.000000
## 104 Coming Soon 50.000000
## 105 Mobile house for sale 50.000000
## 106 Condop for sale 50.000000
## 107 Condo for sale 20.000000
## 108 House for sale 20.000000
## 109 Townhouse for sale 20.000000
## 110 Co-op for sale 20.000000
## 111 For sale 20.000000
## 112 Contingent 20.000000
## 113 Land for sale 20.000000
## 114 Foreclosure 20.000000
## 115 Pending 20.000000
## 116 Coming Soon 20.000000
## 117 Mobile house for sale 20.000000
## 118 Condop for sale 20.000000
## 119 Condo for sale 11.000000
## 120 House for sale 11.000000
## 121 Co-op for sale 11.000000
## 122 For sale 11.000000
## 123 Contingent 11.000000
## 124 Land for sale 11.000000
## 125 Foreclosure 11.000000
## 126 Pending 11.000000
## 127 Coming Soon 11.000000
## 128 Mobile house for sale 11.000000
## 129 Condop for sale 11.000000
## 130 Condo for sale 12.000000
## 131 Townhouse for sale 12.000000
## 132 Co-op for sale 12.000000
## 133 For sale 12.000000
## 134 Contingent 12.000000
## 135 Land for sale 12.000000
## 136 Foreclosure 12.000000
## 137 Pending 12.000000
## 138 Coming Soon 12.000000
## 139 Mobile house for sale 12.000000
## 140 Condop for sale 12.000000
## 141 Condo for sale 24.000000
## 142 House for sale 24.000000
## 143 Townhouse for sale 24.000000
## 144 Co-op for sale 24.000000
## 145 For sale 24.000000
## 146 Contingent 24.000000
## 147 Land for sale 24.000000
## 148 Foreclosure 24.000000
## 149 Pending 24.000000
## 150 Coming Soon 24.000000
## 151 Mobile house for sale 24.000000
## 152 Condop for sale 24.000000
## 153 Condo for sale 43.000000
## 154 House for sale 43.000000
## 155 Townhouse for sale 43.000000
## 156 Co-op for sale 43.000000
## 157 For sale 43.000000
## 158 Contingent 43.000000
## 159 Land for sale 43.000000
## 160 Foreclosure 43.000000
## 161 Pending 43.000000
## 162 Coming Soon 43.000000
## 163 Mobile house for sale 43.000000
## 164 Condop for sale 43.000000
## 165 Condo for sale 0.000000
## 166 House for sale 0.000000
## 167 Co-op for sale 0.000000
## 168 Multi-family home for sale 0.000000
## 169 For sale 0.000000
## 170 Contingent 0.000000
## 171 Land for sale 0.000000
## 172 Foreclosure 0.000000
## 173 Coming Soon 0.000000
## 174 Mobile house for sale 0.000000
## 175 Condop for sale 0.000000
## 176 Condo for sale 17.000000
## 177 House for sale 17.000000
## 178 Townhouse for sale 17.000000
## 179 Co-op for sale 17.000000
## 180 For sale 17.000000
## 181 Contingent 17.000000
## 182 Land for sale 17.000000
## 183 Foreclosure 17.000000
## 184 Pending 17.000000
## 185 Coming Soon 17.000000
## 186 Mobile house for sale 17.000000
## 187 Condop for sale 17.000000
# Count occurrences of each TYPE/BATH combination.
# .groups = "drop" returns an ungrouped tibble. Without it the result stays
# grouped by TYPE, so the max()/min() filters below are evaluated within each
# TYPE and return rows for every property type instead of the overall
# extremes.
combination_counts <- NY_House_Dataset %>%
  group_by(TYPE, BATH) %>%
  summarise(count = n(), .groups = "drop")
# Most and least common combinations across the whole data set; ties are kept.
most_common_combination <- combination_counts %>%
  filter(count == max(count))
least_common_combination <- combination_counts %>%
  filter(count == min(count))
# NOTE: tibble output pasted below this chunk was captured before the
# ungrouping fix and shows per-TYPE results; re-run to refresh it.
most_common_combination
## # A tibble: 14 × 3
## # Groups: TYPE [13]
## TYPE BATH count
## <chr> <dbl> <int>
## 1 Co-op for sale 1 1012
## 2 Coming Soon 2 1
## 3 Coming Soon 3 1
## 4 Condo for sale 2 410
## 5 Condop for sale 1 4
## 6 Contingent 2 34
## 7 For sale 1 8
## 8 Foreclosure 2 7
## 9 House for sale 2 531
## 10 Land for sale 2.37 48
## 11 Mobile house for sale 3 1
## 12 Multi-family home for sale 3 192
## 13 Pending 2 99
## 14 Townhouse for sale 2 75
# Print the `least_common_combination` tibble built above.
least_common_combination
## # A tibble: 27 × 3
## # Groups: TYPE [13]
## TYPE BATH count
## <chr> <dbl> <int>
## 1 Co-op for sale 8 3
## 2 Coming Soon 2 1
## 3 Coming Soon 3 1
## 4 Condo for sale 7 1
## 5 Condo for sale 8 1
## 6 Condo for sale 10 1
## 7 Condop for sale 2 1
## 8 Contingent 9 1
## 9 For sale 2.37 1
## 10 For sale 6 1
## # ℹ 17 more rows
# Bar chart with one bar per TYPE/BATH pairing, bar height = number of
# listings. The same pairing drives both the x position and the fill colour.
ggplot(
  data = combination_counts,
  mapping = aes(
    x = interaction(TYPE, BATH),
    y = count,
    fill = interaction(TYPE, BATH)
  )
) +
  geom_col() +
  labs(
    title = "Count of Property Type and Bathroom Combinations",
    x = "Combination",
    y = "Count"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Numeric Summary Explanation
# Print summary statistics for the PRICE and PROPERTYSQFT columns only
NY_House_Dataset %>%
  select(PRICE, PROPERTYSQFT) %>%
  summary()
## PRICE PROPERTYSQFT
## Min. :2.494e+03 Min. : 230
## 1st Qu.:4.990e+05 1st Qu.: 1200
## Median :8.250e+05 Median : 2184
## Mean :2.357e+06 Mean : 2184
## 3rd Qu.:1.495e+06 3rd Qu.: 2184
## Max. :2.147e+09 Max. :65535
# Explanation to Reader:
# The numeric summary provides an overview of the distribution of property prices and square footage.
# Insights: The mean and median describe the central tendency of price and square footage, while the quartiles and the minimum/maximum show how widely the values are spread.
# Visualization for average price by property type
# Explanation to Reader:
# The bar plot visualizes the average price for different property types.
# Insights: It highlights variations in property prices across types, identifying potential trends or disparities in the market.
# Combination Analysis Explanation
# Check for missing combinations of property type and bathrooms
# Explanation to Reader:
# Identifying missing combinations helps understand data gaps.
# Insights: Some combinations may be absent, indicating limited availability or potential data collection issues.
# Further Questions:
# Open questions raised by the analysis, kept as a character vector and
# printed for the reader.
further_questions <- c(
  "What factors contribute to the scarcity of certain property types or bedroom configurations in the market?",
  "Are there specific characteristics of neighborhoods that explain their varying property prices?",
  "How do missing combinations impact the overall understanding of the real estate market, and what strategies can be employed to address data gaps?"
)
print(further_questions)
## [1] "What factors contribute to the scarcity of certain property types or bedroom configurations in the market?"
## [2] "Are there specific characteristics of neighborhoods that explain their varying property prices?"
## [3] "How do missing combinations impact the overall understanding of the real estate market, and what strategies can be employed to address data gaps?"
# Explanation to Reader:
# These questions guide further exploration and analysis.
# Significance: Addressing these questions could provide deeper insights into market dynamics and improve data completeness.