Load Packages

library(readr)
library(ggplot2)

Load Dataset

library(readr)

df <- read_csv("C:/Users/ujjwa/Downloads/data science/archive/realtor-data.zip.csv")
## Rows: 2226382 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): status, city, state, zip_code
## dbl  (7): brokered_by, price, bed, bath, acre_lot, street, house_size
## date (1): prev_sold_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- as.data.frame(df)

Dataset Overview

dim(df)
## [1] 2226382      12
head(df)
##   brokered_by   status  price bed bath acre_lot  street          city
## 1      103378 for_sale 105000   3    2     0.12 1962661      Adjuntas
## 2       52707 for_sale  80000   4    2     0.08 1902874      Adjuntas
## 3      103379 for_sale  67000   2    1     0.15 1404990    Juana Diaz
## 4       31239 for_sale 145000   4    2     0.10 1947675         Ponce
## 5       34632 for_sale  65000   6    2     0.05  331151      Mayaguez
## 6      103378 for_sale 179000   4    3     0.46 1850806 San Sebastian
##         state zip_code house_size prev_sold_date
## 1 Puerto Rico    00601        920           <NA>
## 2 Puerto Rico    00601       1527           <NA>
## 3 Puerto Rico    00795        748           <NA>
## 4 Puerto Rico    00731       1800           <NA>
## 5 Puerto Rico    00680         NA           <NA>
## 6 Puerto Rico    00612       2520           <NA>
summary(df)
##   brokered_by        status              price                bed        
##  Min.   :     0   Length:2226382     Min.   :0.000e+00   Min.   :  1.00  
##  1st Qu.: 23861   Class :character   1st Qu.:1.650e+05   1st Qu.:  3.00  
##  Median : 52884   Mode  :character   Median :3.250e+05   Median :  3.00  
##  Mean   : 52940                      Mean   :5.242e+05   Mean   :  3.28  
##  3rd Qu.: 79183                      3rd Qu.:5.500e+05   3rd Qu.:  4.00  
##  Max.   :110142                      Max.   :2.147e+09   Max.   :473.00  
##  NA's   :4533                        NA's   :1541        NA's   :481317  
##       bath           acre_lot             street            city          
##  Min.   :  1.0    Min.   :     0.00   Min.   :      0   Length:2226382    
##  1st Qu.:  2.0    1st Qu.:     0.15   1st Qu.: 506313   Class :character  
##  Median :  2.0    Median :     0.26   Median :1012766   Mode  :character  
##  Mean   :  2.5    Mean   :    15.22   Mean   :1012325                     
##  3rd Qu.:  3.0    3rd Qu.:     0.98   3rd Qu.:1521173                     
##  Max.   :830.0    Max.   :100000.00   Max.   :2001357                     
##  NA's   :511771   NA's   :325589      NA's   :10866                       
##     state             zip_code           house_size        prev_sold_date      
##  Length:2226382     Length:2226382     Min.   :4.000e+00   Min.   :1901-01-01  
##  Class :character   Class :character   1st Qu.:1.300e+03   1st Qu.:2016-08-09  
##  Mode  :character   Mode  :character   Median :1.760e+03   Median :2021-12-01  
##                                        Mean   :2.714e+03   Mean   :2017-08-16  
##                                        3rd Qu.:2.413e+03   3rd Qu.:2022-03-04  
##                                        Max.   :1.040e+09   Max.   :3019-04-02  
##                                        NA's   :568484      NA's   :734297

Missing Values

colSums(is.na(df))
##    brokered_by         status          price            bed           bath 
##           4533              0           1541         481317         511771 
##       acre_lot         street           city          state       zip_code 
##         325589          10866           1407              8            299 
##     house_size prev_sold_date 
##         568484         734297

1. Price Distribution

hist(df$price[df$price<1000000],
     main = "Price Distribution",
     xlab = "Price",
     col = "skyblue")

2. Listings per State

state_count <- table(df$state)
barplot(state_count,
        main = "Listings per State",
        col = "orange",
        las = 2)

️ 3. Price vs Bedrooms

scatter_data <- df[!is.na(df$bed) & !is.na(df$price), ]

plot(scatter_data$bed,
     scatter_data$price,
     main = "Scatter Plot: Price vs Bedrooms",
     xlab = "Bedrooms",
     ylab = "Price",
     col = "blue",
     pch = 19)

4. House Size vs Price

plot(df$house_size, df$price,
     main = "House Size vs Price",
     xlab = "House Size",
     ylab = "Price",
     pch = 16,
     col = rgb(0,0,1,0.3))
abline(lm(price ~ house_size, data = df), col = "red", lwd = 2)

5. Bathrooms vs Price

plot(df$bath, df$price,
     main = "Bathrooms vs Price",
     xlab = "Bathrooms",
     ylab = "Price",
     pch = 16,
     col = rgb(1,0,0,0.3))

6. Top 10 Expensive Houses

top10 <- head(df[order(-df$price), ], 10)
top10[, c("city", "state", "price")]
##                  city      state      price
## 221995  International California 2147483600
## 941988         Eureka     Kansas 1000000000
## 72904           Bronx   New York  875000000
## 1288465     San Diego California  515000000
## 2054445   Los Angeles California  295000000
## 375791        Atlanta    Georgia  281500000
## 1258957 Beverly Hills California  250000000
## 1051158         Kosse      Texas  212500000
## 105335  New York City   New York  169000000
## 1258585   Los Angeles California  165000000

7. Average Price by State

state_avg <- aggregate(price ~ state, data = df, mean)
head(state_avg[order(-state_avg$price), ])
##                   state     price
## 13               Hawaii 1240095.3
## 5            California 1029223.7
## 35             New York  943796.8
## 9  District of Columbia  882849.8
## 48                 Utah  881418.4
## 6              Colorado  832179.3

8. Correlation

cor(df[, c("price", "bed", "bath", "house_size")],
    use = "complete.obs")
##                 price       bed      bath house_size
## price      1.00000000 0.1149881 0.2019397 0.07585679
## bed        0.11498812 1.0000000 0.6656029 0.18708478
## bath       0.20193973 0.6656029 1.0000000 0.22573485
## house_size 0.07585679 0.1870848 0.2257349 1.00000000

9. Price Outliers

boxplot(df$price,
        main = "Price Outliers",
        col = "yellow")

10. Price Range Categories

df$price_range <- cut(df$price,
                     breaks = c(0, 200000, 500000, 1000000, Inf),
                     labels = c("Low", "Medium", "High", "Luxury"))

barplot(table(df$price_range),
        main = "Price Range Distribution",
        col = c("green", "blue", "orange", "red"))

11. Average Price by Bedrooms

aggregate(price ~ bed, data = df, mean)
##    bed       price
## 1    1    372766.0
## 2    2    403458.8
## 3    3    451811.0
## 4    4    656969.9
## 5    5   1073591.7
## 6    6   1605613.3
## 7    7   2397309.6
## 8    8   1893457.2
## 9    9   2543846.5
## 10  10   2602077.6
## 11  11   2994711.3
## 12  12   2609231.8
## 13  13   3118753.2
## 14  14   4234846.3
## 15  15   2465431.2
## 16  16   2453852.9
## 17  17   4596060.3
## 18  18   2565537.2
## 19  19   8043875.4
## 20  20   2957435.2
## 21  21   9500368.6
## 22  22   3670463.6
## 23  23   3963108.8
## 24  24   3053511.0
## 25  25   3095143.0
## 26  26   3331281.0
## 27  27   2493690.9
## 28  28   3496972.3
## 29  29   4981126.6
## 30  30   3883862.6
## 31  31   3785332.5
## 32  32   3640240.8
## 33  33   3462426.5
## 34  34   4650714.3
## 35  35   5974362.5
## 36  36   4436903.8
## 37  37   6578333.3
## 38  38   3169222.2
## 39  39   5116666.7
## 40  40   6176787.5
## 41  41   2957800.0
## 42  42   5013205.2
## 43  43   7189499.8
## 44  44   4814285.7
## 45  45  11331666.7
## 46  46   7869166.7
## 47  47   3251166.7
## 48  48   4101153.8
## 49  49   8658333.3
## 50  50   3324750.0
## 51  52   7192500.0
## 52  53   6900000.0
## 53  54   7762125.0
## 54  55   7190000.0
## 55  56  18430000.0
## 56  57   8000000.0
## 57  60   7650000.0
## 58  61   9300000.0
## 59  62   6600000.0
## 60  63   6581666.7
## 61  64   3812362.5
## 62  66   6949975.0
## 63  67   1300000.0
## 64  68   8475000.0
## 65  69   3500000.0
## 66  70  11520000.0
## 67  71   4150000.0
## 68  72   7405714.1
## 69  73  23000000.0
## 70  74   3457812.5
## 71  75   1890000.0
## 72  76   2350000.0
## 73  80   4333333.3
## 74  82   3250000.0
## 75  84  24500000.0
## 76  86  15150000.0
## 77  88  17261000.0
## 78  90  11501950.0
## 79  93  10890000.0
## 80  96   7900000.0
## 81  98   1499000.0
## 82  99  10811000.0
## 83 100   4000000.0
## 84 102   1499000.0
## 85 108  13500000.0
## 86 110   2100000.0
## 87 111  24898000.0
## 88 114   2499000.0
## 89 120   7000000.0
## 90 123 120000000.0
## 91 136   5500000.0
## 92 142   2100000.0
## 93 148    549000.0
## 94 190    189000.0
## 95 210    575000.0
## 96 212   1695000.0
## 97 222    699000.0
## 98 444    435000.0
## 99 473    225000.0

12. Regression Plot (ggplot)

ggplot(df, aes(x = house_size, y = price)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", color = "red") +
  ggtitle("House Size vs Price with Regression")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 569415 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 569415 rows containing missing values or values outside the scale range
## (`geom_point()`).

13. Listings by Status

barplot(table(df$status),
        main = "Listings by Status",
        col = c("purple", "green", "orange"))

14. Bathrooms Distribution

hist(df$bath,
     main = "Bathrooms Distribution",
     xlab = "Bathrooms",
     col = "pink")

️ 15. Highest Price per City

city_max <- aggregate(price ~ city, data = df, max)
head(city_max[order(-city_max$price), ])
##                city      price
## 8493  International 2147483600
## 5662         Eureka 1000000000
## 2102          Bronx  875000000
## 15818     San Diego  515000000
## 10270   Los Angeles  295000000
## 702         Atlanta  281500000

16. Bedrooms vs Bathrooms

plot(df$bed, df$bath,
     main = "Bedrooms vs Bathrooms",
     xlab = "Bedrooms",
     ylab = "Bathrooms",
     col = "darkblue")

17. Linear Regression Model

model <- lm(price ~ house_size + bed + bath, data = df)
summary(model)
## 
## Call:
## lm(formula = price ~ house_size + bed + bath, data = df)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -160774168    -289796    -112516      86862 2147028430 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.482e+05  4.314e+03  -34.36   <2e-16 ***
## house_size   1.882e+01  4.481e-01   42.00   <2e-16 ***
## bed         -5.746e+04  1.606e+03  -35.79   <2e-16 ***
## bath         3.508e+05  1.673e+03  209.70   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2174000 on 1605505 degrees of freedom
##   (620873 observations deleted due to missingness)
## Multiple R-squared:  0.04251,    Adjusted R-squared:  0.04251 
## F-statistic: 2.376e+04 on 3 and 1605505 DF,  p-value: < 2.2e-16

18. Residual Plot

plot(model$fitted.values, model$residuals,
     main = "Residual Plot",
     xlab = "Fitted Values",
     ylab = "Residuals",
     col = "blue")
abline(h = 0, col = "red")

19. Price by State

boxplot(price ~ state,
        data = df,
        main = "Price by State",
        col = "lightblue",
        las = 2)

20. Density Plot

plot(density(df$price, na.rm = TRUE),
     main = "Price Density",
     col = "darkgreen")

21. Correlation Heatmap

corr_matrix <- cor(df[, c("price", "bed", "bath", "house_size")],
                   use = "complete.obs")

image(1:ncol(corr_matrix), 1:nrow(corr_matrix), corr_matrix,
      axes = FALSE, col = heat.colors(10))

axis(1, 1:ncol(corr_matrix), colnames(corr_matrix))
axis(2, 1:nrow(corr_matrix), rownames(corr_matrix))

22. Listings by Bedrooms

bed_count <- table(df$bed)
plot(bed_count,
     type = "b",
     main = "Listings by Bedrooms",
     col = "purple")

23. Price by Status

boxplot(price ~ status,
        data = df,
        main = "Price by Status",
        col = "orange")

24. Log Price Distribution

hist(log(df$price),
     main = "Log Price Distribution",
     col = "cyan")

25. Size Category vs Price

df$size_cat <- cut(df$house_size,
                   breaks = c(0, 1000, 2000, 3000, Inf),
                   labels = c("Small", "Medium", "Large", "Luxury"))

boxplot(price ~ size_cat,
        data = df,
        main = "Price by Size Category",
        col = c("green", "blue", "orange", "red"))

26. Multi-variable Scatter Plot

ggplot(df, aes(x = house_size, y = price, color = factor(bed))) +
  geom_point(alpha = 0.5) +
  ggtitle("House Size vs Price (Bedrooms Colored)")
## Warning: Removed 569415 rows containing missing values or values outside the scale range
## (`geom_point()`).

27. Price vs Bathrooms Trend

bath_avg <- aggregate(price ~ bath, data = df, mean)

plot(bath_avg$bath, bath_avg$price,
     type = "l",
     main = "Avg Price by Bathrooms",
     xlab = "Bathrooms",
     ylab = "Price",
     col = "red")

## Q27 Linear Regression

model1 <- lm(price ~ house_size, data = df)
summary(model1)
## 
## Call:
## lm(formula = price ~ house_size, data = df)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
##   -1138792    -346027    -206028      18872 2146902573 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.810e+05  1.705e+03 340.757   <2e-16 ***
## house_size  6.899e-04  2.109e-03   0.327    0.744    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2195000 on 1656965 degrees of freedom
##   (569415 observations deleted due to missingness)
## Multiple R-squared:  6.456e-08,  Adjusted R-squared:  -5.389e-07 
## F-statistic: 0.107 on 1 and 1656965 DF,  p-value: 0.7436

Interpretation: Size impact on price. ## Q29 Polynomial Regression

df <- as.data.frame(df)

# Remove rows with missing values in required columns
df <- na.omit(df[, c("price", "house_size", "bed", "bath")])
model3 <- lm(price ~ poly(house_size, 2), data = df)
summary(model3)
## 
## Call:
## lm(formula = price ~ poly(house_size, 2), data = df)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
##  -49882638    -278715    -177113       4127 2147116015 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              580211       1731  335.11   <2e-16 ***
## poly(house_size, 2)1  213509056    2193857   97.32   <2e-16 ***
## poly(house_size, 2)2 -386342002    2193857 -176.10   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2194000 on 1605506 degrees of freedom
## Multiple R-squared:  0.0246, Adjusted R-squared:  0.02459 
## F-statistic: 2.024e+04 on 2 and 1605506 DF,  p-value: < 2.2e-16

Interpretation: Captures non-linear trend.

Q30 Regression Line

plot(df$house_size, df$price)
abline(lm(price ~ house_size, data = df), col = "red")

Interpretation: Model fit visualization.


Final Insights

cat("Key Insights:\n")
## Key Insights:
cat("- House size strongly affects price\n")
## - House size strongly affects price
cat("- Price distribution is right skewed\n")
## - Price distribution is right skewed
cat("- Bathrooms impact price more than bedrooms\n")
## - Bathrooms impact price more than bedrooms
cat("- Larger homes = higher price\n")
## - Larger homes = higher price
cat("- Regression model shows reasonable prediction\n")
## - Regression model shows reasonable prediction