library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
# Load the housing data set; readr guesses the column types from the file.
cali <- read_csv(file = "cali.csv")
## Rows: 20640 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ocean_proximity
## dbl (9): longitude, latitude, housing_median_age, total_rooms, total_bedroom...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Auto-print the tibble to preview the first rows and the column types.
cali
## # A tibble: 20,640 × 10
## longitude latitude housing_median_age total_rooms total_bedrooms population
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -122. 37.9 41 880 129 322
## 2 -122. 37.9 21 7099 1106 2401
## 3 -122. 37.8 52 1467 190 496
## 4 -122. 37.8 52 1274 235 558
## 5 -122. 37.8 52 1627 280 565
## 6 -122. 37.8 52 919 213 413
## 7 -122. 37.8 52 2535 489 1094
## 8 -122. 37.8 52 3104 687 1157
## 9 -122. 37.8 42 2555 665 1206
## 10 -122. 37.8 52 3549 707 1551
## # ℹ 20,630 more rows
## # ℹ 4 more variables: households <dbl>, median_income <dbl>,
## # median_house_value <dbl>, ocean_proximity <chr>
# Discard every row that has a missing value in any column, then preview
# the cleaned tibble.
cali <- cali %>% na.omit()
cali
## # A tibble: 20,433 × 10
## longitude latitude housing_median_age total_rooms total_bedrooms population
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -122. 37.9 41 880 129 322
## 2 -122. 37.9 21 7099 1106 2401
## 3 -122. 37.8 52 1467 190 496
## 4 -122. 37.8 52 1274 235 558
## 5 -122. 37.8 52 1627 280 565
## 6 -122. 37.8 52 919 213 413
## 7 -122. 37.8 52 2535 489 1094
## 8 -122. 37.8 52 3104 687 1157
## 9 -122. 37.8 42 2555 665 1206
## 10 -122. 37.8 52 3549 707 1551
## # ℹ 20,423 more rows
## # ℹ 4 more variables: households <dbl>, median_income <dbl>,
## # median_house_value <dbl>, ocean_proximity <chr>
# Count, per column, the entries that are empty strings.
colSums(cali == "")
## longitude latitude housing_median_age total_rooms
## 0 0 0 0
## total_bedrooms population households median_income
## 0 0 0 0
## median_house_value ocean_proximity
## 0 0
# Inspect the data: open it in the viewer, check the type of `population`,
# and show the overall structure of the tibble.
view(cali)
class(cali[["population"]])
## [1] "numeric"
str(cali)
## tibble [20,433 × 10] (S3: tbl_df/tbl/data.frame)
## $ longitude : num [1:20433] -122 -122 -122 -122 -122 ...
## $ latitude : num [1:20433] 37.9 37.9 37.9 37.9 37.9 ...
## $ housing_median_age: num [1:20433] 41 21 52 52 52 52 52 52 42 52 ...
## $ total_rooms : num [1:20433] 880 7099 1467 1274 1627 ...
## $ total_bedrooms : num [1:20433] 129 1106 190 235 280 ...
## $ population : num [1:20433] 322 2401 496 558 565 ...
## $ households : num [1:20433] 126 1138 177 219 259 ...
## $ median_income : num [1:20433] 8.33 8.3 7.26 5.64 3.85 ...
## $ median_house_value: num [1:20433] 452600 358500 352100 341300 342200 ...
## $ ocean_proximity : chr [1:20433] "NEAR BAY" "NEAR BAY" "NEAR BAY" "NEAR BAY" ...
## - attr(*, "na.action")= 'omit' Named int [1:207] 291 342 539 564 697 739 1098 1351 1457 1494 ...
## ..- attr(*, "names")= chr [1:207] "291" "342" "539" "564" ...
# Proximity to ocean and house value
# Treat ocean proximity as a categorical variable.
cali[["ocean_proximity"]] <- as.factor(cali[["ocean_proximity"]])
# Mean of median_house_value within each proximity class.
average_price_by_ocean_proximity <- summarise(
  group_by(cali, ocean_proximity),
  avg_median_house_value = mean(median_house_value)
)
print(average_price_by_ocean_proximity)
## # A tibble: 5 × 2
## ocean_proximity avg_median_house_value
## <fct> <dbl>
## 1 <1H OCEAN 240268.
## 2 INLAND 124897.
## 3 ISLAND 380440
## 4 NEAR BAY 259279.
## 5 NEAR OCEAN 249042.
# Bar chart of the average house value for each ocean-proximity class.
ggplot(
  data = average_price_by_ocean_proximity,
  mapping = aes(x = ocean_proximity, y = avg_median_house_value)
) +
  geom_col(fill = "skyblue") +
  ggtitle("Average Median House Value by Ocean Proximity") +
  xlab("Ocean Proximity") +
  ylab("Average Median House Value") +
  theme_minimal()

# Box plot of house values per ocean-proximity class; values are divided by
# 1e6 so the y axis reads in millions.
ggplot(
  data = cali,
  mapping = aes(x = ocean_proximity, y = median_house_value / 1e6)
) +
  geom_boxplot() +
  scale_y_continuous(labels = scales::comma) +
  ggtitle("Median House Value by Ocean Proximity") +
  xlab("Ocean Proximity") +
  ylab("Median House Value (in millions)") +
  theme_minimal()

# Median income and population density
# NOTE: "population_density" is computed as population / total_rooms, i.e.
# people per room rather than people per unit area.
cali[["population_density"]] <- cali[["population"]] / cali[["total_rooms"]]
# Correlation between income and density, computed separately within each
# ocean-proximity class.
correlation_analysis <- summarise(
  group_by(cali, ocean_proximity),
  correlation = cor(median_income, population_density, use = "complete.obs"),
  .groups = "drop"
)
print(correlation_analysis)
## # A tibble: 5 × 2
## ocean_proximity correlation
## <fct> <dbl>
## 1 <1H OCEAN -0.498
## 2 INLAND 0.0521
## 3 ISLAND -0.452
## 4 NEAR BAY -0.329
## 5 NEAR OCEAN -0.0958
# Scatter plot of income against density, coloured by ocean proximity, with
# one straight-line fit (no confidence band) per proximity class.
ggplot(
  data = cali,
  mapping = aes(
    x = median_income,
    y = population_density,
    color = ocean_proximity
  )
) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("Correlation Between Median Income and Population Density by Ocean Proximity") +
  xlab("Median Income") +
  ylab("Population Density") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Age of house compared to market value
# Average house value for every (ocean proximity, housing age) combination.
age_value_analysis <- summarise(
  group_by(cali, ocean_proximity, housing_median_age),
  avg_median_house_value = mean(median_house_value),
  .groups = "drop"
)
# One line per ocean-proximity class showing how the average value varies
# with housing age.
ggplot(
  data = age_value_analysis,
  mapping = aes(
    x = housing_median_age,
    y = avg_median_house_value,
    group = ocean_proximity,
    color = ocean_proximity
  )
) +
  geom_line() +
  ggtitle("Impact of Housing Age on Market Value by Ocean Proximity") +
  xlab("Housing Median Age") +
  ylab("Average Median House Value") +
  theme_minimal()

# k-means clustering by housing properties
# Standardise the three features (scale() centres each column and divides it
# by its standard deviation) so that no single feature dominates the distance
# computation because of its units.
cali_scaled <- cali %>%
  select(median_income, median_house_value, population_density) %>%
  scale()
# Fix the RNG seed so the random initial centers, and therefore the cluster
# assignment, are reproducible.
set.seed(123)
kmeans_result <- kmeans(cali_scaled, centers = 3)
cali$cluster <- as.factor(kmeans_result$cluster)
# kmeans() was fitted on standardised data, so its centers are expressed in
# z-score units. The plot below uses the raw columns of `cali`, so the centers
# must be mapped back to the original units (x * sd + mean); otherwise the
# markers would be drawn near the origin instead of on top of their clusters.
cluster_centers <- sweep(
  kmeans_result$centers, 2, attr(cali_scaled, "scaled:scale"), "*"
)
cluster_centers <- sweep(
  cluster_centers, 2, attr(cali_scaled, "scaled:center"), "+"
)
cluster_centers <- as.data.frame(cluster_centers)
ggplot(cali, aes(x = median_income, y = median_house_value, color = cluster)) +
  geom_point(alpha = 0.6) +
  # The centers data frame has no `cluster` column, so do not inherit the
  # plot-level colour mapping for this layer.
  geom_point(
    data = cluster_centers,
    aes(x = median_income, y = median_house_value),
    inherit.aes = FALSE,
    color = "red", size = 5, shape = 23
  ) +
  labs(title = "Clustering of California Housing by Median Income and House Value",
       x = "Median Income", y = "Median House Value") +
  theme_minimal()

# Decision tree predictors for median house values
# Label a row "High" when its house value is above the overall median and
# "Low" otherwise, then store the label as a factor for use as the class.
cali$high_value <- with(
  cali,
  as.factor(
    ifelse(
      median_house_value > median(median_house_value, na.rm = TRUE),
      "High",
      "Low"
    )
  )
)
library(C50)
# Fit a C5.0 classification tree on income, room count and ocean proximity.
cali_tree <- C5.0(high_value ~ median_income + total_rooms + ocean_proximity, data = cali)
plot(cali_tree)

# Print the fitted tree together with its evaluation on the training data.
summary(cali_tree)
##
## Call:
## C5.0.formula(formula = high_value ~ median_income + total_rooms
## + ocean_proximity, data = cali)
##
##
## C5.0 [Release 2.07 GPL Edition] Wed Jan 15 20:31:08 2025
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 20433 cases (4 attributes) from undefined.data
##
## Decision tree:
##
## ocean_proximity = INLAND:
## :...median_income <= 4.1218: Low (5092/286)
## : median_income > 4.1218:
## : :...median_income <= 5.678: Low (1041/364)
## : median_income > 5.678: High (363/66)
## ocean_proximity in {<1H OCEAN,ISLAND,NEAR BAY,NEAR OCEAN}:
## :...median_income > 4.1303: High (5954/504)
## median_income <= 4.1303:
## :...median_income <= 2.8326: Low (3552/1073)
## median_income > 2.8326:
## :...total_rooms > 2032: High (2355/734)
## total_rooms <= 2032:
## :...median_income <= 3.2219: Low (651/290)
## median_income > 3.2219: High (1425/590)
##
##
## Evaluation on training data (20433 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 8 3907(19.1%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 8203 2013 (a): class High
## 1894 8323 (b): class Low
##
##
## Attribute usage:
##
## 100.00% median_income
## 100.00% ocean_proximity
## 21.69% total_rooms
##
##
## Time: 0.0 secs
#housing attributes that frequently occur together in high value areas
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
## Attaching package: 'arules'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following objects are masked from 'package:base':
##
## abbreviate, write
# Discretise income and housing age into three equal-width bins each, because
# association-rule mining works on categorical items.
cali[["income_level"]] <- cut(
  cali[["median_income"]],
  breaks = 3,
  labels = c("Low", "Medium", "High")
)
cali[["age_group"]] <- cut(
  cali[["housing_median_age"]],
  breaks = 3,
  labels = c("Young", "Middle-aged", "Old")
)
# Ensure ocean proximity is a factor (no change if it already is one).
cali[["ocean_proximity"]] <- as.factor(cali[["ocean_proximity"]])
# NOTE(review): all rows are used here and `high_value` is not among the
# items, although the section header mentions high value areas; confirm
# whether a filter on high-value rows was intended.
trans_data <- cali[c("income_level", "age_group", "ocean_proximity")]
trans <- as(trans_data, "transactions")
# Mine rules with at least 5% support and 80% confidence.
rules <- apriori(
  trans,
  parameter = list(support = 0.05, confidence = 0.8)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.05 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 1021
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[11 item(s), 20433 transaction(s)] done [0.00s].
## sorting and recoding items ... [9 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [10 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Show the five rules with the highest lift.
inspect(sort(rules, decreasing = TRUE, by = "lift")[seq_len(5)])
## lhs rhs support confidence coverage lift count
## [1] {age_group=Middle-aged,
## ocean_proximity=INLAND} => {income_level=Low} 0.12861547 0.9545950 0.13473303 1.151227 2628
## [2] {age_group=Old,
## ocean_proximity=INLAND} => {income_level=Low} 0.05740714 0.9490291 0.06049038 1.144515 1173
## [3] {ocean_proximity=INLAND} => {income_level=Low} 0.29393628 0.9245690 0.31791709 1.115016 6006
## [4] {age_group=Old} => {income_level=Low} 0.27020017 0.8797004 0.30715020 1.060905 5521
## [5] {age_group=Young,
## ocean_proximity=INLAND} => {income_level=Low} 0.10791367 0.8795373 0.12269368 1.060709 2205