Load Packages

library(readr)
library(ggplot2)

Load Dataset

library(readr)

df <- read_csv("C:/Users/ujjwa/Downloads/data science/archive/realtor-data.zip.csv")

## Rows: 2226382 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): status, city, state, zip_code
## dbl  (7): brokered_by, price, bed, bath, acre_lot, street, house_size
## date (1): prev_sold_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

df <- as.data.frame(df)

Dataset Overview

dim(df)

## [1] 2226382      12

head(df)

##   brokered_by   status  price bed bath acre_lot  street          city
## 1      103378 for_sale 105000   3    2     0.12 1962661      Adjuntas
## 2       52707 for_sale  80000   4    2     0.08 1902874      Adjuntas
## 3      103379 for_sale  67000   2    1     0.15 1404990    Juana Diaz
## 4       31239 for_sale 145000   4    2     0.10 1947675         Ponce
## 5       34632 for_sale  65000   6    2     0.05  331151      Mayaguez
## 6      103378 for_sale 179000   4    3     0.46 1850806 San Sebastian
##         state zip_code house_size prev_sold_date
## 1 Puerto Rico    00601        920           <NA>
## 2 Puerto Rico    00601       1527           <NA>
## 3 Puerto Rico    00795        748           <NA>
## 4 Puerto Rico    00731       1800           <NA>
## 5 Puerto Rico    00680         NA           <NA>
## 6 Puerto Rico    00612       2520           <NA>

summary(df)

##   brokered_by        status              price                bed        
##  Min.   :     0   Length:2226382     Min.   :0.000e+00   Min.   :  1.00  
##  1st Qu.: 23861   Class :character   1st Qu.:1.650e+05   1st Qu.:  3.00  
##  Median : 52884   Mode  :character   Median :3.250e+05   Median :  3.00  
##  Mean   : 52940                      Mean   :5.242e+05   Mean   :  3.28  
##  3rd Qu.: 79183                      3rd Qu.:5.500e+05   3rd Qu.:  4.00  
##  Max.   :110142                      Max.   :2.147e+09   Max.   :473.00  
##  NA's   :4533                        NA's   :1541        NA's   :481317  
##       bath           acre_lot             street            city          
##  Min.   :  1.0    Min.   :     0.00   Min.   :      0   Length:2226382    
##  1st Qu.:  2.0    1st Qu.:     0.15   1st Qu.: 506313   Class :character  
##  Median :  2.0    Median :     0.26   Median :1012766   Mode  :character  
##  Mean   :  2.5    Mean   :    15.22   Mean   :1012325                     
##  3rd Qu.:  3.0    3rd Qu.:     0.98   3rd Qu.:1521173                     
##  Max.   :830.0    Max.   :100000.00   Max.   :2001357                     
##  NA's   :511771   NA's   :325589      NA's   :10866                       
##     state             zip_code           house_size        prev_sold_date      
##  Length:2226382     Length:2226382     Min.   :4.000e+00   Min.   :1901-01-01  
##  Class :character   Class :character   1st Qu.:1.300e+03   1st Qu.:2016-08-09  
##  Mode  :character   Mode  :character   Median :1.760e+03   Median :2021-12-01  
##                                        Mean   :2.714e+03   Mean   :2017-08-16  
##                                        3rd Qu.:2.413e+03   3rd Qu.:2022-03-04  
##                                        Max.   :1.040e+09   Max.   :3019-04-02  
##                                        NA's   :568484      NA's   :734297

Missing Values

colSums(is.na(df))

##    brokered_by         status          price            bed           bath 
##           4533              0           1541         481317         511771 
##       acre_lot         street           city          state       zip_code 
##         325589          10866           1407              8            299 
##     house_size prev_sold_date 
##         568484         734297

1. Price Distribution

hist(df$price[df$price<1000000],
     main = "Price Distribution",
     xlab = "Price",
     col = "skyblue")

2. Listings per State

state_count <- table(df$state)
barplot(state_count,
        main = "Listings per State",
        col = "orange",
        las = 2)

️ 3. Price vs Bedrooms

scatter_data <- df[!is.na(df$bed) & !is.na(df$price), ]

plot(scatter_data$bed,
     scatter_data$price,
     main = "Scatter Plot: Price vs Bedrooms",
     xlab = "Bedrooms",
     ylab = "Price",
     col = "blue",
     pch = 19)

4. House Size vs Price

plot(df$house_size, df$price,
     main = "House Size vs Price",
     xlab = "House Size",
     ylab = "Price",
     pch = 16,
     col = rgb(0,0,1,0.3))
abline(lm(price ~ house_size, data = df), col = "red", lwd = 2)

5. Bathrooms vs Price

plot(df$bath, df$price,
     main = "Bathrooms vs Price",
     xlab = "Bathrooms",
     ylab = "Price",
     pch = 16,
     col = rgb(1,0,0,0.3))

6. Top 10 Expensive Houses

top10 <- head(df[order(-df$price), ], 10)
top10[, c("city", "state", "price")]

##                  city      state      price
## 221995  International California 2147483600
## 941988         Eureka     Kansas 1000000000
## 72904           Bronx   New York  875000000
## 1288465     San Diego California  515000000
## 2054445   Los Angeles California  295000000
## 375791        Atlanta    Georgia  281500000
## 1258957 Beverly Hills California  250000000
## 1051158         Kosse      Texas  212500000
## 105335  New York City   New York  169000000
## 1258585   Los Angeles California  165000000

7. Average Price by State

state_avg <- aggregate(price ~ state, data = df, mean)
head(state_avg[order(-state_avg$price), ])

##                   state     price
## 13               Hawaii 1240095.3
## 5            California 1029223.7
## 35             New York  943796.8
## 9  District of Columbia  882849.8
## 48                 Utah  881418.4
## 6              Colorado  832179.3

8. Correlation

cor(df[, c("price", "bed", "bath", "house_size")],
    use = "complete.obs")

##                 price       bed      bath house_size
## price      1.00000000 0.1149881 0.2019397 0.07585679
## bed        0.11498812 1.0000000 0.6656029 0.18708478
## bath       0.20193973 0.6656029 1.0000000 0.22573485
## house_size 0.07585679 0.1870848 0.2257349 1.00000000

9. Price Outliers

boxplot(df$price,
        main = "Price Outliers",
        col = "yellow")

10. Price Range Categories

df$price_range <- cut(df$price,
                     breaks = c(0, 200000, 500000, 1000000, Inf),
                     labels = c("Low", "Medium", "High", "Luxury"))

barplot(table(df$price_range),
        main = "Price Range Distribution",
        col = c("green", "blue", "orange", "red"))

11. Average Price by Bedrooms

aggregate(price ~ bed, data = df, mean)

##    bed       price
## 1    1    372766.0
## 2    2    403458.8
## 3    3    451811.0
## 4    4    656969.9
## 5    5   1073591.7
## 6    6   1605613.3
## 7    7   2397309.6
## 8    8   1893457.2
## 9    9   2543846.5
## 10  10   2602077.6
## 11  11   2994711.3
## 12  12   2609231.8
## 13  13   3118753.2
## 14  14   4234846.3
## 15  15   2465431.2
## 16  16   2453852.9
## 17  17   4596060.3
## 18  18   2565537.2
## 19  19   8043875.4
## 20  20   2957435.2
## 21  21   9500368.6
## 22  22   3670463.6
## 23  23   3963108.8
## 24  24   3053511.0
## 25  25   3095143.0
## 26  26   3331281.0
## 27  27   2493690.9
## 28  28   3496972.3
## 29  29   4981126.6
## 30  30   3883862.6
## 31  31   3785332.5
## 32  32   3640240.8
## 33  33   3462426.5
## 34  34   4650714.3
## 35  35   5974362.5
## 36  36   4436903.8
## 37  37   6578333.3
## 38  38   3169222.2
## 39  39   5116666.7
## 40  40   6176787.5
## 41  41   2957800.0
## 42  42   5013205.2
## 43  43   7189499.8
## 44  44   4814285.7
## 45  45  11331666.7
## 46  46   7869166.7
## 47  47   3251166.7
## 48  48   4101153.8
## 49  49   8658333.3
## 50  50   3324750.0
## 51  52   7192500.0
## 52  53   6900000.0
## 53  54   7762125.0
## 54  55   7190000.0
## 55  56  18430000.0
## 56  57   8000000.0
## 57  60   7650000.0
## 58  61   9300000.0
## 59  62   6600000.0
## 60  63   6581666.7
## 61  64   3812362.5
## 62  66   6949975.0
## 63  67   1300000.0
## 64  68   8475000.0
## 65  69   3500000.0
## 66  70  11520000.0
## 67  71   4150000.0
## 68  72   7405714.1
## 69  73  23000000.0
## 70  74   3457812.5
## 71  75   1890000.0
## 72  76   2350000.0
## 73  80   4333333.3
## 74  82   3250000.0
## 75  84  24500000.0
## 76  86  15150000.0
## 77  88  17261000.0
## 78  90  11501950.0
## 79  93  10890000.0
## 80  96   7900000.0
## 81  98   1499000.0
## 82  99  10811000.0
## 83 100   4000000.0
## 84 102   1499000.0
## 85 108  13500000.0
## 86 110   2100000.0
## 87 111  24898000.0
## 88 114   2499000.0
## 89 120   7000000.0
## 90 123 120000000.0
## 91 136   5500000.0
## 92 142   2100000.0
## 93 148    549000.0
## 94 190    189000.0
## 95 210    575000.0
## 96 212   1695000.0
## 97 222    699000.0
## 98 444    435000.0
## 99 473    225000.0

12. Regression Plot (ggplot)

ggplot(df, aes(x = house_size, y = price)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", color = "red") +
  ggtitle("House Size vs Price with Regression")

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 569415 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 569415 rows containing missing values or values outside the scale range
## (`geom_point()`).

13. Listings by Status

barplot(table(df$status),
        main = "Listings by Status",
        col = c("purple", "green", "orange"))

14. Bathrooms Distribution

hist(df$bath,
     main = "Bathrooms Distribution",
     xlab = "Bathrooms",
     col = "pink")

️ 15. Highest Price per City

city_max <- aggregate(price ~ city, data = df, max)
head(city_max[order(-city_max$price), ])

##                city      price
## 8493  International 2147483600
## 5662         Eureka 1000000000
## 2102          Bronx  875000000
## 15818     San Diego  515000000
## 10270   Los Angeles  295000000
## 702         Atlanta  281500000

16. Bedrooms vs Bathrooms

plot(df$bed, df$bath,
     main = "Bedrooms vs Bathrooms",
     xlab = "Bedrooms",
     ylab = "Bathrooms",
     col = "darkblue")

17. Linear Regression Model

model <- lm(price ~ house_size + bed + bath, data = df)
summary(model)

## 
## Call:
## lm(formula = price ~ house_size + bed + bath, data = df)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -160774168    -289796    -112516      86862 2147028430 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.482e+05  4.314e+03  -34.36   <2e-16 ***
## house_size   1.882e+01  4.481e-01   42.00   <2e-16 ***
## bed         -5.746e+04  1.606e+03  -35.79   <2e-16 ***
## bath         3.508e+05  1.673e+03  209.70   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2174000 on 1605505 degrees of freedom
##   (620873 observations deleted due to missingness)
## Multiple R-squared:  0.04251,    Adjusted R-squared:  0.04251 
## F-statistic: 2.376e+04 on 3 and 1605505 DF,  p-value: < 2.2e-16

18. Residual Plot

plot(model$fitted.values, model$residuals,
     main = "Residual Plot",
     xlab = "Fitted Values",
     ylab = "Residuals",
     col = "blue")
abline(h = 0, col = "red")

19. Price by State

boxplot(price ~ state,
        data = df,
        main = "Price by State",
        col = "lightblue",
        las = 2)

20. Density Plot

plot(density(df$price, na.rm = TRUE),
     main = "Price Density",
     col = "darkgreen")

21. Correlation Heatmap

corr_matrix <- cor(df[, c("price", "bed", "bath", "house_size")],
                   use = "complete.obs")

image(1:ncol(corr_matrix), 1:nrow(corr_matrix), corr_matrix,
      axes = FALSE, col = heat.colors(10))

axis(1, 1:ncol(corr_matrix), colnames(corr_matrix))
axis(2, 1:nrow(corr_matrix), rownames(corr_matrix))

22. Listings by Bedrooms

bed_count <- table(df$bed)
plot(bed_count,
     type = "b",
     main = "Listings by Bedrooms",
     col = "purple")

23. Price by Status

boxplot(price ~ status,
        data = df,
        main = "Price by Status",
        col = "orange")

24. Log Price Distribution

hist(log(df$price),
     main = "Log Price Distribution",
     col = "cyan")

25. Size Category vs Price

df$size_cat <- cut(df$house_size,
                   breaks = c(0, 1000, 2000, 3000, Inf),
                   labels = c("Small", "Medium", "Large", "Luxury"))

boxplot(price ~ size_cat,
        data = df,
        main = "Price by Size Category",
        col = c("green", "blue", "orange", "red"))

26. Multi-variable Scatter Plot

ggplot(df, aes(x = house_size, y = price, color = factor(bed))) +
  geom_point(alpha = 0.5) +
  ggtitle("House Size vs Price (Bedrooms Colored)")

## Warning: Removed 569415 rows containing missing values or values outside the scale range
## (`geom_point()`).

27. Price vs Bathrooms Trend

bath_avg <- aggregate(price ~ bath, data = df, mean)

plot(bath_avg$bath, bath_avg$price,
     type = "l",
     main = "Avg Price by Bathrooms",
     xlab = "Bathrooms",
     ylab = "Price",
     col = "red")

## Q27 Linear Regression

model1 <- lm(price ~ house_size, data = df)
summary(model1)

## 
## Call:
## lm(formula = price ~ house_size, data = df)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
##   -1138792    -346027    -206028      18872 2146902573 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.810e+05  1.705e+03 340.757   <2e-16 ***
## house_size  6.899e-04  2.109e-03   0.327    0.744    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2195000 on 1656965 degrees of freedom
##   (569415 observations deleted due to missingness)
## Multiple R-squared:  6.456e-08,  Adjusted R-squared:  -5.389e-07 
## F-statistic: 0.107 on 1 and 1656965 DF,  p-value: 0.7436

Interpretation: Size impact on price. ## Q29 Polynomial Regression

df <- as.data.frame(df)

# Remove rows with missing values in required columns
df <- na.omit(df[, c("price", "house_size", "bed", "bath")])
model3 <- lm(price ~ poly(house_size, 2), data = df)
summary(model3)

## 
## Call:
## lm(formula = price ~ poly(house_size, 2), data = df)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
##  -49882638    -278715    -177113       4127 2147116015 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              580211       1731  335.11   <2e-16 ***
## poly(house_size, 2)1  213509056    2193857   97.32   <2e-16 ***
## poly(house_size, 2)2 -386342002    2193857 -176.10   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2194000 on 1605506 degrees of freedom
## Multiple R-squared:  0.0246, Adjusted R-squared:  0.02459 
## F-statistic: 2.024e+04 on 2 and 1605506 DF,  p-value: < 2.2e-16

Interpretation: Captures non-linear trend.

Q30 Regression Line

plot(df$house_size, df$price)
abline(lm(price ~ house_size, data = df), col = "red")

Interpretation: Model fit visualization.

Final Insights

cat("Key Insights:\n")

## Key Insights:

cat("- House size strongly affects price\n")

## - House size strongly affects price

cat("- Price distribution is right skewed\n")

## - Price distribution is right skewed

cat("- Bathrooms impact price more than bedrooms\n")

## - Bathrooms impact price more than bedrooms

cat("- Larger homes = higher price\n")

## - Larger homes = higher price

cat("- Regression model shows reasonable prediction\n")

## - Regression model shows reasonable prediction

Realtor Dataset Analysis

Ujjwal

Load Packages

Load Dataset

Dataset Overview

Missing Values

1. Price Distribution

2. Listings per State

️ 3. Price vs Bedrooms

4. House Size vs Price

5. Bathrooms vs Price

6. Top 10 Expensive Houses

7. Average Price by State

8. Correlation

9. Price Outliers

10. Price Range Categories

11. Average Price by Bedrooms

12. Regression Plot (ggplot)

13. Listings by Status

14. Bathrooms Distribution

️ 15. Highest Price per City

16. Bedrooms vs Bathrooms

17. Linear Regression Model

18. Residual Plot

19. Price by State

20. Density Plot

21. Correlation Heatmap

22. Listings by Bedrooms

23. Price by Status

24. Log Price Distribution

25. Size Category vs Price

26. Multi-variable Scatter Plot

27. Price vs Bathrooms Trend

Q30 Regression Line

Final Insights