library(readr)
library(ggplot2)
library(readr)
df <- read_csv("C:/Users/ujjwa/Downloads/data science/archive/realtor-data.zip.csv")
## Rows: 2226382 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): status, city, state, zip_code
## dbl (7): brokered_by, price, bed, bath, acre_lot, street, house_size
## date (1): prev_sold_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- as.data.frame(df)
dim(df)
## [1] 2226382 12
head(df)
## brokered_by status price bed bath acre_lot street city
## 1 103378 for_sale 105000 3 2 0.12 1962661 Adjuntas
## 2 52707 for_sale 80000 4 2 0.08 1902874 Adjuntas
## 3 103379 for_sale 67000 2 1 0.15 1404990 Juana Diaz
## 4 31239 for_sale 145000 4 2 0.10 1947675 Ponce
## 5 34632 for_sale 65000 6 2 0.05 331151 Mayaguez
## 6 103378 for_sale 179000 4 3 0.46 1850806 San Sebastian
## state zip_code house_size prev_sold_date
## 1 Puerto Rico 00601 920 <NA>
## 2 Puerto Rico 00601 1527 <NA>
## 3 Puerto Rico 00795 748 <NA>
## 4 Puerto Rico 00731 1800 <NA>
## 5 Puerto Rico 00680 NA <NA>
## 6 Puerto Rico 00612 2520 <NA>
summary(df)
## brokered_by status price bed
## Min. : 0 Length:2226382 Min. :0.000e+00 Min. : 1.00
## 1st Qu.: 23861 Class :character 1st Qu.:1.650e+05 1st Qu.: 3.00
## Median : 52884 Mode :character Median :3.250e+05 Median : 3.00
## Mean : 52940 Mean :5.242e+05 Mean : 3.28
## 3rd Qu.: 79183 3rd Qu.:5.500e+05 3rd Qu.: 4.00
## Max. :110142 Max. :2.147e+09 Max. :473.00
## NA's :4533 NA's :1541 NA's :481317
## bath acre_lot street city
## Min. : 1.0 Min. : 0.00 Min. : 0 Length:2226382
## 1st Qu.: 2.0 1st Qu.: 0.15 1st Qu.: 506313 Class :character
## Median : 2.0 Median : 0.26 Median :1012766 Mode :character
## Mean : 2.5 Mean : 15.22 Mean :1012325
## 3rd Qu.: 3.0 3rd Qu.: 0.98 3rd Qu.:1521173
## Max. :830.0 Max. :100000.00 Max. :2001357
## NA's :511771 NA's :325589 NA's :10866
## state zip_code house_size prev_sold_date
## Length:2226382 Length:2226382 Min. :4.000e+00 Min. :1901-01-01
## Class :character Class :character 1st Qu.:1.300e+03 1st Qu.:2016-08-09
## Mode :character Mode :character Median :1.760e+03 Median :2021-12-01
## Mean :2.714e+03 Mean :2017-08-16
## 3rd Qu.:2.413e+03 3rd Qu.:2022-03-04
## Max. :1.040e+09 Max. :3019-04-02
## NA's :568484 NA's :734297
colSums(is.na(df))
## brokered_by status price bed bath
## 4533 0 1541 481317 511771
## acre_lot street city state zip_code
## 325589 10866 1407 8 299
## house_size prev_sold_date
## 568484 734297
hist(df$price[df$price<1000000],
main = "Price Distribution",
xlab = "Price",
col = "skyblue")
state_count <- table(df$state)
barplot(state_count,
main = "Listings per State",
col = "orange",
las = 2)
scatter_data <- df[!is.na(df$bed) & !is.na(df$price), ]
plot(scatter_data$bed,
scatter_data$price,
main = "Scatter Plot: Price vs Bedrooms",
xlab = "Bedrooms",
ylab = "Price",
col = "blue",
pch = 19)
plot(df$house_size, df$price,
main = "House Size vs Price",
xlab = "House Size",
ylab = "Price",
pch = 16,
col = rgb(0,0,1,0.3))
abline(lm(price ~ house_size, data = df), col = "red", lwd = 2)
plot(df$bath, df$price,
main = "Bathrooms vs Price",
xlab = "Bathrooms",
ylab = "Price",
pch = 16,
col = rgb(1,0,0,0.3))
top10 <- head(df[order(-df$price), ], 10)
top10[, c("city", "state", "price")]
## city state price
## 221995 International California 2147483600
## 941988 Eureka Kansas 1000000000
## 72904 Bronx New York 875000000
## 1288465 San Diego California 515000000
## 2054445 Los Angeles California 295000000
## 375791 Atlanta Georgia 281500000
## 1258957 Beverly Hills California 250000000
## 1051158 Kosse Texas 212500000
## 105335 New York City New York 169000000
## 1258585 Los Angeles California 165000000
state_avg <- aggregate(price ~ state, data = df, mean)
head(state_avg[order(-state_avg$price), ])
## state price
## 13 Hawaii 1240095.3
## 5 California 1029223.7
## 35 New York 943796.8
## 9 District of Columbia 882849.8
## 48 Utah 881418.4
## 6 Colorado 832179.3
cor(df[, c("price", "bed", "bath", "house_size")],
use = "complete.obs")
## price bed bath house_size
## price 1.00000000 0.1149881 0.2019397 0.07585679
## bed 0.11498812 1.0000000 0.6656029 0.18708478
## bath 0.20193973 0.6656029 1.0000000 0.22573485
## house_size 0.07585679 0.1870848 0.2257349 1.00000000
boxplot(df$price,
main = "Price Outliers",
col = "yellow")
df$price_range <- cut(df$price,
breaks = c(0, 200000, 500000, 1000000, Inf),
labels = c("Low", "Medium", "High", "Luxury"))
barplot(table(df$price_range),
main = "Price Range Distribution",
col = c("green", "blue", "orange", "red"))
aggregate(price ~ bed, data = df, mean)
## bed price
## 1 1 372766.0
## 2 2 403458.8
## 3 3 451811.0
## 4 4 656969.9
## 5 5 1073591.7
## 6 6 1605613.3
## 7 7 2397309.6
## 8 8 1893457.2
## 9 9 2543846.5
## 10 10 2602077.6
## 11 11 2994711.3
## 12 12 2609231.8
## 13 13 3118753.2
## 14 14 4234846.3
## 15 15 2465431.2
## 16 16 2453852.9
## 17 17 4596060.3
## 18 18 2565537.2
## 19 19 8043875.4
## 20 20 2957435.2
## 21 21 9500368.6
## 22 22 3670463.6
## 23 23 3963108.8
## 24 24 3053511.0
## 25 25 3095143.0
## 26 26 3331281.0
## 27 27 2493690.9
## 28 28 3496972.3
## 29 29 4981126.6
## 30 30 3883862.6
## 31 31 3785332.5
## 32 32 3640240.8
## 33 33 3462426.5
## 34 34 4650714.3
## 35 35 5974362.5
## 36 36 4436903.8
## 37 37 6578333.3
## 38 38 3169222.2
## 39 39 5116666.7
## 40 40 6176787.5
## 41 41 2957800.0
## 42 42 5013205.2
## 43 43 7189499.8
## 44 44 4814285.7
## 45 45 11331666.7
## 46 46 7869166.7
## 47 47 3251166.7
## 48 48 4101153.8
## 49 49 8658333.3
## 50 50 3324750.0
## 51 52 7192500.0
## 52 53 6900000.0
## 53 54 7762125.0
## 54 55 7190000.0
## 55 56 18430000.0
## 56 57 8000000.0
## 57 60 7650000.0
## 58 61 9300000.0
## 59 62 6600000.0
## 60 63 6581666.7
## 61 64 3812362.5
## 62 66 6949975.0
## 63 67 1300000.0
## 64 68 8475000.0
## 65 69 3500000.0
## 66 70 11520000.0
## 67 71 4150000.0
## 68 72 7405714.1
## 69 73 23000000.0
## 70 74 3457812.5
## 71 75 1890000.0
## 72 76 2350000.0
## 73 80 4333333.3
## 74 82 3250000.0
## 75 84 24500000.0
## 76 86 15150000.0
## 77 88 17261000.0
## 78 90 11501950.0
## 79 93 10890000.0
## 80 96 7900000.0
## 81 98 1499000.0
## 82 99 10811000.0
## 83 100 4000000.0
## 84 102 1499000.0
## 85 108 13500000.0
## 86 110 2100000.0
## 87 111 24898000.0
## 88 114 2499000.0
## 89 120 7000000.0
## 90 123 120000000.0
## 91 136 5500000.0
## 92 142 2100000.0
## 93 148 549000.0
## 94 190 189000.0
## 95 210 575000.0
## 96 212 1695000.0
## 97 222 699000.0
## 98 444 435000.0
## 99 473 225000.0
ggplot(df, aes(x = house_size, y = price)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm", color = "red") +
ggtitle("House Size vs Price with Regression")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 569415 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 569415 rows containing missing values or values outside the scale range
## (`geom_point()`).
barplot(table(df$status),
main = "Listings by Status",
col = c("purple", "green", "orange"))
hist(df$bath,
main = "Bathrooms Distribution",
xlab = "Bathrooms",
col = "pink")
city_max <- aggregate(price ~ city, data = df, max)
head(city_max[order(-city_max$price), ])
## city price
## 8493 International 2147483600
## 5662 Eureka 1000000000
## 2102 Bronx 875000000
## 15818 San Diego 515000000
## 10270 Los Angeles 295000000
## 702 Atlanta 281500000
plot(df$bed, df$bath,
main = "Bedrooms vs Bathrooms",
xlab = "Bedrooms",
ylab = "Bathrooms",
col = "darkblue")
model <- lm(price ~ house_size + bed + bath, data = df)
summary(model)
##
## Call:
## lm(formula = price ~ house_size + bed + bath, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -160774168 -289796 -112516 86862 2147028430
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.482e+05 4.314e+03 -34.36 <2e-16 ***
## house_size 1.882e+01 4.481e-01 42.00 <2e-16 ***
## bed -5.746e+04 1.606e+03 -35.79 <2e-16 ***
## bath 3.508e+05 1.673e+03 209.70 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2174000 on 1605505 degrees of freedom
## (620873 observations deleted due to missingness)
## Multiple R-squared: 0.04251, Adjusted R-squared: 0.04251
## F-statistic: 2.376e+04 on 3 and 1605505 DF, p-value: < 2.2e-16
plot(model$fitted.values, model$residuals,
main = "Residual Plot",
xlab = "Fitted Values",
ylab = "Residuals",
col = "blue")
abline(h = 0, col = "red")
boxplot(price ~ state,
data = df,
main = "Price by State",
col = "lightblue",
las = 2)
plot(density(df$price, na.rm = TRUE),
main = "Price Density",
col = "darkgreen")
corr_matrix <- cor(df[, c("price", "bed", "bath", "house_size")],
use = "complete.obs")
image(1:ncol(corr_matrix), 1:nrow(corr_matrix), corr_matrix,
axes = FALSE, col = heat.colors(10))
axis(1, 1:ncol(corr_matrix), colnames(corr_matrix))
axis(2, 1:nrow(corr_matrix), rownames(corr_matrix))
bed_count <- table(df$bed)
plot(bed_count,
type = "b",
main = "Listings by Bedrooms",
col = "purple")
boxplot(price ~ status,
data = df,
main = "Price by Status",
col = "orange")
hist(log(df$price),
main = "Log Price Distribution",
col = "cyan")
df$size_cat <- cut(df$house_size,
breaks = c(0, 1000, 2000, 3000, Inf),
labels = c("Small", "Medium", "Large", "Luxury"))
boxplot(price ~ size_cat,
data = df,
main = "Price by Size Category",
col = c("green", "blue", "orange", "red"))
ggplot(df, aes(x = house_size, y = price, color = factor(bed))) +
geom_point(alpha = 0.5) +
ggtitle("House Size vs Price (Bedrooms Colored)")
## Warning: Removed 569415 rows containing missing values or values outside the scale range
## (`geom_point()`).
bath_avg <- aggregate(price ~ bath, data = df, mean)
plot(bath_avg$bath, bath_avg$price,
type = "l",
main = "Avg Price by Bathrooms",
xlab = "Bathrooms",
ylab = "Price",
col = "red")
## Q27 Linear Regression
model1 <- lm(price ~ house_size, data = df)
summary(model1)
##
## Call:
## lm(formula = price ~ house_size, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1138792 -346027 -206028 18872 2146902573
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.810e+05 1.705e+03 340.757 <2e-16 ***
## house_size 6.899e-04 2.109e-03 0.327 0.744
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2195000 on 1656965 degrees of freedom
## (569415 observations deleted due to missingness)
## Multiple R-squared: 6.456e-08, Adjusted R-squared: -5.389e-07
## F-statistic: 0.107 on 1 and 1656965 DF, p-value: 0.7436
Interpretation: Size impact on price. ## Q29 Polynomial Regression
df <- as.data.frame(df)
# Remove rows with missing values in required columns
df <- na.omit(df[, c("price", "house_size", "bed", "bath")])
model3 <- lm(price ~ poly(house_size, 2), data = df)
summary(model3)
##
## Call:
## lm(formula = price ~ poly(house_size, 2), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49882638 -278715 -177113 4127 2147116015
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 580211 1731 335.11 <2e-16 ***
## poly(house_size, 2)1 213509056 2193857 97.32 <2e-16 ***
## poly(house_size, 2)2 -386342002 2193857 -176.10 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2194000 on 1605506 degrees of freedom
## Multiple R-squared: 0.0246, Adjusted R-squared: 0.02459
## F-statistic: 2.024e+04 on 2 and 1605506 DF, p-value: < 2.2e-16
Interpretation: Captures non-linear trend.
plot(df$house_size, df$price)
abline(lm(price ~ house_size, data = df), col = "red")
Interpretation: Model fit visualization.
cat("Key Insights:\n")
## Key Insights:
cat("- House size strongly affects price\n")
## - House size strongly affects price
cat("- Price distribution is right skewed\n")
## - Price distribution is right skewed
cat("- Bathrooms impact price more than bedrooms\n")
## - Bathrooms impact price more than bedrooms
cat("- Larger homes = higher price\n")
## - Larger homes = higher price
cat("- Regression model shows reasonable prediction\n")
## - Regression model shows reasonable prediction