# Ask the user to pick the CSV file interactively, then read it into a
# data frame used by the rest of the analysis.
f <- file.choose()
housing_prices <- read.csv(file = f)
library(table1)
## Warning: package 'table1' was built under R version 4.2.3
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
# Descriptive summary of each covariate, stratified by the rooms category.
table1(
  ~ river + price + age + industry + ptratio + low_ses | rooms,
  data = housing_prices
)
| | 4-room (N=15) | 5-room (N=27) | 6-room (N=254) | 7-room (N=76) | Overall (N=372) |
|---|---|---|---|---|---|
| river | |||||
| No | 15 (100%) | 23 (85.2%) | 239 (94.1%) | 67 (88.2%) | 344 (92.5%) |
| Yes | 0 (0%) | 4 (14.8%) | 15 (5.9%) | 9 (11.8%) | 28 (7.5%) |
| price | |||||
| Mean (SD) | 17.3 (10.7) | 14.0 (5.05) | 18.9 (5.50) | 28.8 (11.7) | 20.5 (8.59) |
| Median [Min, Max] | 13.8 [7.00, 50.0] | 14.4 [5.00, 23.7] | 19.4 [5.00, 50.0] | 27.5 [7.50, 50.0] | 19.8 [5.00, 50.0] |
| age | |||||
| Mean (SD) | 93.5 (15.9) | 89.8 (20.4) | 75.6 (24.3) | 77.2 (21.1) | 77.7 (23.6) |
| Median [Min, Max] | 100 [37.8, 100] | 96.2 [9.80, 100] | 84.5 [6.00, 100] | 82.7 [2.90, 100] | 87.3 [2.90, 100] |
| industry | |||||
| Mean (SD) | 17.8 (2.23) | 17.7 (5.43) | 13.6 (6.16) | 11.1 (6.53) | 13.5 (6.32) |
| Median [Min, Max] | 18.1 [9.90, 19.6] | 18.1 [6.91, 27.7] | 13.9 [2.18, 27.7] | 9.90 [1.89, 19.6] | 18.1 [1.89, 27.7] |
| ptratio | |||||
| Mean (SD) | 19.3 (1.94) | 18.7 (2.31) | 19.3 (1.71) | 18.4 (1.81) | 19.1 (1.82) |
| Median [Min, Max] | 20.2 [14.7, 20.2] | 20.1 [14.7, 21.2] | 20.2 [14.7, 21.2] | 18.0 [14.7, 21.0] | 20.2 [14.7, 21.2] |
| low_ses | |||||
| Mean (SD) | 24.4 (11.4) | 23.3 (6.75) | 14.5 (5.37) | 9.14 (6.25) | 14.4 (7.15) |
| Median [Min, Max] | 29.3 [3.26, 38.0] | 24.0 [10.2, 34.4] | 14.1 [5.08, 34.0] | 6.79 [1.73, 25.8] | 13.6 [1.73, 38.0] |
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
# Histogram of housing prices on the density scale, overlaid with a kernel
# density curve.
p <- ggplot(data = housing_prices, aes(x = price))
# after_stat(density) replaces the `..density..` dot-dot notation, which was
# deprecated in ggplot2 3.4.0. bins = 30 spells out the value stat_bin() was
# already choosing by default, so the figure is unchanged while the
# "Pick better value" message is no longer emitted.
p1 <- p + geom_histogram(
  aes(y = after_stat(density)),
  bins = 30,
  color = "white",
  fill = "blue"
)
p2 <- p1 + geom_density(col = "red")
p2 + ggtitle("Distribution of housing prices") + theme_bw()
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Box plots of price for each rooms category with the individual observations
# jittered faintly on top; fill and colour are both mapped to rooms.
p <- ggplot(
  data = housing_prices,
  mapping = aes(x = rooms, y = price, fill = rooms, col = rooms)
)
# The box outline colour is set to black, overriding the mapped colour for
# the boxes only; the jittered points keep the per-group colour.
p1 <- p +
  geom_boxplot(col = "black") +
  geom_jitter(alpha = 0.05)
p1 +
  labs(x = "Rooms", y = "Housing Prices (USD)") +
  ggtitle("Housing Prices by Rooms") +
  theme_bw()
# Kruskal-Wallis rank sum test of price across the rooms groups.
kruskal.test(
  price ~ rooms,
  data = housing_prices
)
##
## Kruskal-Wallis rank sum test
##
## data: price by rooms
## Kruskal-Wallis chi-squared = 75.504, df = 3, p-value = 2.826e-16
# One-way ANOVA of price on rooms; the fitted object is reused below for the
# diagnostic plots and the Tukey comparisons.
price.rooms <- aov(formula = price ~ rooms, data = housing_prices)
# Print the ANOVA table for the fit.
summary(price.rooms)
## Df Sum Sq Mean Sq F value Pr(>F)
## rooms 3 7162 2387.2 43.49 <2e-16 ***
## Residuals 368 20202 54.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Draw the diagnostic plots for the aov fit in a 2 x 2 grid.
# par() returns the previous values of the parameters it changes; keeping
# them lets the layout be restored afterwards so that later plots are not
# forced into the same 2 x 2 grid.
old_par <- par(mfrow = c(2, 2))
plot(price.rooms)
par(old_par)
# Tukey honest significant differences for every pair of rooms groups,
# computed from the one-way ANOVA fit, then printed.
tukey.price.rooms <- TukeyHSD(x = price.rooms)
tukey.price.rooms
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = price ~ rooms, data = housing_prices)
##
## $rooms
## diff lwr upr p adj
## 5-room-4-room -3.215556 -9.373321 2.942210 0.5331459
## 6-room-4-room 1.603780 -3.477109 6.684668 0.8475492
## 7-room-4-room 11.511053 6.108559 16.913546 0.0000004
## 6-room-5-room 4.819335 0.948716 8.689954 0.0077590
## 7-room-5-room 14.726608 10.442544 19.010672 0.0000000
## 7-room-6-room 9.907273 7.407162 12.407384 0.0000000