# california-analysis.R

library(ggplot2)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

# Load the housing data from cali.csv (path is relative to the working
# directory); readr guesses the column types and reports them.
cali <- read_csv(file = "cali.csv")

## Rows: 20640 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ocean_proximity
## dbl (9): longitude, latitude, housing_median_age, total_rooms, total_bedroom...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Show the freshly loaded tibble (first rows plus column types) before cleaning
cali

## # A tibble: 20,640 × 10
##    longitude latitude housing_median_age total_rooms total_bedrooms population
##        <dbl>    <dbl>              <dbl>       <dbl>          <dbl>      <dbl>
##  1     -122.     37.9                 41         880            129        322
##  2     -122.     37.9                 21        7099           1106       2401
##  3     -122.     37.8                 52        1467            190        496
##  4     -122.     37.8                 52        1274            235        558
##  5     -122.     37.8                 52        1627            280        565
##  6     -122.     37.8                 52         919            213        413
##  7     -122.     37.8                 52        2535            489       1094
##  8     -122.     37.8                 52        3104            687       1157
##  9     -122.     37.8                 42        2555            665       1206
## 10     -122.     37.8                 52        3549            707       1551
## # ℹ 20,630 more rows
## # ℹ 4 more variables: households <dbl>, median_income <dbl>,
## #   median_house_value <dbl>, ocean_proximity <chr>

# Drop every row that contains a missing value, then show what is left
cali <- cali %>% na.omit()
cali

## # A tibble: 20,433 × 10
##    longitude latitude housing_median_age total_rooms total_bedrooms population
##        <dbl>    <dbl>              <dbl>       <dbl>          <dbl>      <dbl>
##  1     -122.     37.9                 41         880            129        322
##  2     -122.     37.9                 21        7099           1106       2401
##  3     -122.     37.8                 52        1467            190        496
##  4     -122.     37.8                 52        1274            235        558
##  5     -122.     37.8                 52        1627            280        565
##  6     -122.     37.8                 52         919            213        413
##  7     -122.     37.8                 52        2535            489       1094
##  8     -122.     37.8                 52        3104            687       1157
##  9     -122.     37.8                 42        2555            665       1206
## 10     -122.     37.8                 52        3549            707       1551
## # ℹ 20,423 more rows
## # ℹ 4 more variables: households <dbl>, median_income <dbl>,
## #   median_house_value <dbl>, ocean_proximity <chr>

# Count empty-string cells per column. Rows with NA were removed just before
# this, so the element-wise comparison cannot produce NA counts here.
colSums(cali == "")

##          longitude           latitude housing_median_age        total_rooms 
##                  0                  0                  0                  0 
##     total_bedrooms         population         households      median_income 
##                  0                  0                  0                  0 
## median_house_value    ocean_proximity 
##                  0                  0

# Open the cleaned data in the data viewer for a manual look
view(cali)

# Check the storage class of the population column
class(cali[["population"]])

## [1] "numeric"

# Compact overview of every column: type plus the first few values
utils::str(cali)

## tibble [20,433 × 10] (S3: tbl_df/tbl/data.frame)
##  $ longitude         : num [1:20433] -122 -122 -122 -122 -122 ...
##  $ latitude          : num [1:20433] 37.9 37.9 37.9 37.9 37.9 ...
##  $ housing_median_age: num [1:20433] 41 21 52 52 52 52 52 52 42 52 ...
##  $ total_rooms       : num [1:20433] 880 7099 1467 1274 1627 ...
##  $ total_bedrooms    : num [1:20433] 129 1106 190 235 280 ...
##  $ population        : num [1:20433] 322 2401 496 558 565 ...
##  $ households        : num [1:20433] 126 1138 177 219 259 ...
##  $ median_income     : num [1:20433] 8.33 8.3 7.26 5.64 3.85 ...
##  $ median_house_value: num [1:20433] 452600 358500 352100 341300 342200 ...
##  $ ocean_proximity   : chr [1:20433] "NEAR BAY" "NEAR BAY" "NEAR BAY" "NEAR BAY" ...
##  - attr(*, "na.action")= 'omit' Named int [1:207] 291 342 539 564 697 739 1098 1351 1457 1494 ...
##   ..- attr(*, "names")= chr [1:207] "291" "342" "539" "564" ...

# Proximity to the ocean and house value ----

# Treat ocean proximity as a categorical variable
cali[["ocean_proximity"]] <- as.factor(cali[["ocean_proximity"]])

# Mean of median_house_value within each ocean-proximity class
average_price_by_ocean_proximity <- summarise(
  group_by(cali, ocean_proximity),
  avg_median_house_value = mean(median_house_value)
)
print(average_price_by_ocean_proximity)

## # A tibble: 5 × 2
##   ocean_proximity avg_median_house_value
##   <fct>                            <dbl>
## 1 <1H OCEAN                      240268.
## 2 INLAND                         124897.
## 3 ISLAND                         380440 
## 4 NEAR BAY                       259279.
## 5 NEAR OCEAN                     249042.

# Bar chart: one bar per proximity class, bar height = the precomputed average
# (geom_col() plots the y values as given, like geom_bar(stat = "identity")).
ggplot(
  data = average_price_by_ocean_proximity,
  mapping = aes(x = ocean_proximity, y = avg_median_house_value)
) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Average Median House Value by Ocean Proximity",
    x = "Ocean Proximity",
    y = "Average Median House Value"
  ) +
  theme_minimal()

# Box plot of house values per proximity class; values are divided by 1e6 so
# the y axis reads in millions, matching the axis label.
ggplot(
  data = cali,
  mapping = aes(x = ocean_proximity, y = median_house_value / 1e6)
) +
  geom_boxplot() +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Median House Value by Ocean Proximity",
    x = "Ocean Proximity",
    y = "Median House Value (in millions)"
  ) +
  theme_minimal()

# Median income and population density ----

# NOTE(review): this column is population divided by total_rooms (people per
# room), not people per unit area; the name is kept because later code in this
# script refers to it.
cali[["population_density"]] <- cali[["population"]] / cali[["total_rooms"]]

# Correlation between income and the density proxy, computed separately for
# each ocean-proximity class
correlation_analysis <- summarise(
  group_by(cali, ocean_proximity),
  correlation = cor(median_income, population_density, use = "complete.obs"),
  .groups = "drop"
)

print(correlation_analysis)

## # A tibble: 5 × 2
##   ocean_proximity correlation
##   <fct>                 <dbl>
## 1 <1H OCEAN           -0.498 
## 2 INLAND               0.0521
## 3 ISLAND              -0.452 
## 4 NEAR BAY            -0.329 
## 5 NEAR OCEAN          -0.0958

# Scatter of the density proxy against income, coloured by proximity class,
# with one linear (lm) fit per class and no confidence band
ggplot(
  data = cali,
  mapping = aes(
    x = median_income,
    y = population_density,
    colour = ocean_proximity
  )
) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Correlation Between Median Income and Population Density by Ocean Proximity",
    x = "Median Income",
    y = "Population Density"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

# Age of house compared to market value ----

# Average median house value for every (proximity class, housing age) pair
age_value_analysis <- summarise(
  group_by(cali, ocean_proximity, housing_median_age),
  avg_median_house_value = mean(median_house_value),
  .groups = "drop"
)

# One line per proximity class: average house value as a function of housing age
ggplot(
  data = age_value_analysis,
  mapping = aes(
    x = housing_median_age,
    y = avg_median_house_value,
    group = ocean_proximity,
    colour = ocean_proximity
  )
) +
  geom_line() +
  labs(
    title = "Impact of Housing Age on Market Value by Ocean Proximity",
    x = "Housing Median Age",
    y = "Average Median House Value"
  ) +
  theme_minimal()

# k-means clustering by housing properties ----

# Standardise the three clustering variables so each contributes on a
# comparable scale. scale() returns a matrix and records the column means and
# standard deviations in its "scaled:center" / "scaled:scale" attributes.
cali_scaled <- cali %>%
  select(median_income, median_house_value, population_density) %>%
  scale()

# Fixed seed so the random starting centres, and therefore the cluster labels,
# are reproducible.
set.seed(123)
kmeans_result <- kmeans(cali_scaled, centers = 3)

cali$cluster <- as.factor(kmeans_result$cluster)

# kmeans() was fitted on the standardised matrix, so its centres are in
# z-score units, while the plot below uses the raw income and house-value axes.
# Undo the scaling (value * sd + mean) so each centre marker is drawn inside
# its own cluster instead of all three collapsing near (0, 0).
cluster_centers <- kmeans_result$centers %>%
  sweep(2, attr(cali_scaled, "scaled:scale"), `*`) %>%
  sweep(2, attr(cali_scaled, "scaled:center"), `+`) %>%
  as.data.frame()

ggplot(cali, aes(x = median_income, y = median_house_value, color = cluster)) +
  geom_point(alpha = 0.6) +
  geom_point(
    data = cluster_centers,
    aes(x = median_income, y = median_house_value),
    inherit.aes = FALSE, # the centres table has no `cluster` column
    color = "red",
    size = 5,
    shape = 23
  ) +
  labs(
    title = "Clustering of California Housing by Median Income and House Value",
    x = "Median Income",
    y = "Median House Value"
  ) +
  theme_minimal()

# Decision tree predictors for median house values ----

# Label a row "High" when its house value is above the overall median of the
# column, otherwise "Low"
cali[["high_value"]] <- as.factor(
  ifelse(
    cali[["median_house_value"]] >
      median(cali[["median_house_value"]], na.rm = TRUE),
    "High",
    "Low"
  )
)

library(C50)

# C5.0 classification tree for the High/Low label using income, room count
# and ocean proximity as predictors
cali_tree <- C5.0(
  high_value ~ median_income + total_rooms + ocean_proximity,
  data = cali
)

plot(cali_tree)

summary(cali_tree)

## 
## Call:
## C5.0.formula(formula = high_value ~ median_income + total_rooms
##  + ocean_proximity, data = cali)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Wed Jan 15 20:31:08 2025
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 20433 cases (4 attributes) from undefined.data
## 
## Decision tree:
## 
## ocean_proximity = INLAND:
## :...median_income <= 4.1218: Low (5092/286)
## :   median_income > 4.1218:
## :   :...median_income <= 5.678: Low (1041/364)
## :       median_income > 5.678: High (363/66)
## ocean_proximity in {<1H OCEAN,ISLAND,NEAR BAY,NEAR OCEAN}:
## :...median_income > 4.1303: High (5954/504)
##     median_income <= 4.1303:
##     :...median_income <= 2.8326: Low (3552/1073)
##         median_income > 2.8326:
##         :...total_rooms > 2032: High (2355/734)
##             total_rooms <= 2032:
##             :...median_income <= 3.2219: Low (651/290)
##                 median_income > 3.2219: High (1425/590)
## 
## 
## Evaluation on training data (20433 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##       8 3907(19.1%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##    8203  2013    (a): class High
##    1894  8323    (b): class Low
## 
## 
##  Attribute usage:
## 
##  100.00% median_income
##  100.00% ocean_proximity
##   21.69% total_rooms
## 
## 
## Time: 0.0 secs

#housing attributes that frequently occur together in high value areas
library(arules)

## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 
## Attaching package: 'arules'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following objects are masked from 'package:base':
## 
##     abbreviate, write

# Discretise income and housing age into three bins each; cut() with
# breaks = 3 splits the observed range into equal-width intervals.
# NOTE(review): the section heading mentions high-value areas, but the
# transactions below are built from every row of cali, not only rows with
# high_value == "High" -- confirm whether a filter was intended.
cali[["income_level"]] <- cut(
  cali[["median_income"]],
  breaks = 3,
  labels = c("Low", "Medium", "High")
)
cali[["age_group"]] <- cut(
  cali[["housing_median_age"]],
  breaks = 3,
  labels = c("Young", "Middle-aged", "Old")
)
cali[["ocean_proximity"]] <- as.factor(cali[["ocean_proximity"]])

# Keep only the three categorical columns that become transaction items
trans_data <- cali[c("income_level", "age_group", "ocean_proximity")]

trans <- as(trans_data, "transactions")

# Mine rules with minimum support 0.05 and minimum confidence 0.8
rules <- apriori(trans, parameter = list(supp = 0.05, conf = 0.8))

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5    0.05      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 1021 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[11 item(s), 20433 transaction(s)] done [0.00s].
## sorting and recoding items ... [9 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [10 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

# Show the (up to) five rules with the highest lift. head() is used instead of
# indexing with [1:5] so the call still works when apriori() returns fewer
# than five rules; with five or more rules the output is the same.
top_rules <- head(sort(rules, by = "lift"), n = 5)
inspect(top_rules)

##     lhs                         rhs                   support confidence   coverage     lift count
## [1] {age_group=Middle-aged,                                                                       
##      ocean_proximity=INLAND} => {income_level=Low} 0.12861547  0.9545950 0.13473303 1.151227  2628
## [2] {age_group=Old,                                                                               
##      ocean_proximity=INLAND} => {income_level=Low} 0.05740714  0.9490291 0.06049038 1.144515  1173
## [3] {ocean_proximity=INLAND} => {income_level=Low} 0.29393628  0.9245690 0.31791709 1.115016  6006
## [4] {age_group=Old}          => {income_level=Low} 0.27020017  0.8797004 0.30715020 1.060905  5521
## [5] {age_group=Young,                                                                             
##      ocean_proximity=INLAND} => {income_level=Low} 0.10791367  0.8795373 0.12269368 1.060709  2205

# california-analysis.R

# trevo

# 2025-01-15