###California Housing Dataset.
library(readr)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
housing<-read.csv("housing.csv")
View(housing)
housing<- housing %>%
mutate(
population_category = cut(
population,
breaks = c(0, 1000, 3000, 5000, 10000, Inf),
labels = c("Low", "Moderate", "High", "Very High", "Extreme")
),
housing_median_age_category = cut(
housing_median_age,
breaks = c(0, 10, 20, 30, 40, 50, 60, Inf),
labels = c("0-10", "11-20", "21-30", "31-40", "41-50", "51-60", "61+")
)
)
We created two new categorical columns. population_category and housing_median_age_category
Grouping and calculating the probabilities by group.
grouped_by_ocean <- housing%>%
group_by(ocean_proximity) %>%
summarize(
mean_median_house_value = mean(median_house_value),
count = n()
)
grouped_by_ocean <- grouped_by_ocean %>%
mutate(
probability = count / sum(count)
) %>%
arrange(desc(probability))
lowest_probability <- grouped_by_ocean$probability[length(grouped_by_ocean$probability)]
grouped_by_ocean <- grouped_by_ocean %>%
mutate(
anomaly = ifelse(probability <= lowest_probability, "Anomaly", "Normal")
)
Lowest Probability of ocean is 0.0002422481
library(ggplot2)
ggplot(grouped_by_ocean, aes(x = ocean_proximity, y = probability, fill = anomaly)) +
geom_bar(stat = "identity") +
labs(title = "Anomaly Detection by Ocean Proximity", y = "Probability") +
theme_minimal()
Compared to other categories, the “ISLAND” category in the ocean_proximity variable appears to be unusual, according to the study of probabilities. This is probably because of the geographical restrictions associated with being on an island. Understanding California’s housing trends and economic growth may be possible with the use of this information.
grouped_by_population <- housing %>%
group_by(population_category) %>%
summarize(
mean_median_house_value = mean(median_house_value),
count = n()
)
grouped_by_population <- grouped_by_population %>%
mutate(
probability = count / sum(count)
) %>%
arrange(desc(probability))
lowest_probability <- grouped_by_population$probability[length(grouped_by_population$probability)]
grouped_by_population <- grouped_by_population %>%
mutate(
anomaly = ifelse(probability <= lowest_probability, "Anomaly", "Normal")
)
Lowest Probability of population is 0.001114341
ggplot(grouped_by_population, aes(x = population_category, y = probability, fill = anomaly)) +
geom_bar(stat = "identity") +
labs(title = "Anomaly Detection by Population Category", y = "Probability") +
theme_minimal()
The analysis of probabilities related to the population_category
variable suggests that districts with extreme population categories are
rare compared to those with moderate population densities. The rarity of
these extreme categories could be due to various factors related to
housing development, infrastructure, and geographic constraints.
grouped_by_age <- housing %>%
group_by(housing_median_age_category) %>%
summarize(
mean_median_house_value = mean(median_house_value),
count = n()
)
grouped_by_age <- grouped_by_age %>%
mutate(
probability = count / sum(count)
) %>%
arrange(desc(probability))
lowest_probability <- min(grouped_by_age$probability)
grouped_by_age <- grouped_by_age %>%
mutate(
anomaly = if_else(probability <= lowest_probability, "Anomaly", "Normal")
)
Lowest Probability of age is 0.06400194
ggplot(grouped_by_age, aes(x = housing_median_age_category, y = probability, fill = anomaly)) +
geom_bar(stat = "identity") +
labs(title = "Anomaly Detection by Housing Median Age Category", y = "Probability") +
theme_minimal()
possible_combinations <- expand.grid(
ocean_proximity = unique(housing$ocean_proximity),
population_category = unique(housing$population_category),
housing_median_age_category = unique(housing$housing_median_age_category)
)
missing_combinations <- anti_join(possible_combinations, housing)
## Joining with `by = join_by(ocean_proximity, population_category,
## housing_median_age_category)`
combination_counts <- housing %>%
group_by(
ocean_proximity,
population_category,
housing_median_age_category
) %>%
summarize(count = n())
## `summarise()` has grouped output by 'ocean_proximity', 'population_category'.
## You can override using the `.groups` argument.
combination_counts <- combination_counts %>%
arrange(desc(count))
head(combination_counts, n = 10)
## # A tibble: 10 × 4
## # Groups: ocean_proximity, population_category [4]
## ocean_proximity population_category housing_median_age_category count
## <chr> <fct> <fct> <int>
## 1 <1H OCEAN Moderate 31-40 1690
## 2 <1H OCEAN Moderate 21-30 1484
## 3 <1H OCEAN Low 31-40 1154
## 4 INLAND Moderate 11-20 1132
## 5 <1H OCEAN Moderate 11-20 1130
## 6 INLAND Moderate 21-30 855
## 7 INLAND Low 11-20 796
## 8 INLAND Low 31-40 686
## 9 INLAND Moderate 31-40 662
## 10 INLAND Low 21-30 647
tail(combination_counts, n = 10)
## # A tibble: 10 × 4
## # Groups: ocean_proximity, population_category [8]
## ocean_proximity population_category housing_median_age_category count
## <chr> <fct> <fct> <int>
## 1 ISLAND Low 51-60 2
## 2 NEAR OCEAN Extreme 0-10 2
## 3 INLAND Very High 41-50 1
## 4 INLAND Very High 51-60 1
## 5 INLAND Extreme 11-20 1
## 6 ISLAND Moderate 51-60 1
## 7 NEAR BAY Very High 31-40 1
## 8 NEAR OCEAN High 51-60 1
## 9 NEAR OCEAN Very High 41-50 1
## 10 NEAR OCEAN Extreme 21-30 1
Combinations Heatmap.
ggplot(combination_counts, aes(
x = housing_median_age_category,
y = population_category,
fill = count
)) +
geom_tile() +
scale_fill_gradient(
low = "white", high = "blue",
limits = c(0, max(combination_counts$count))
) +
labs(
title = "Heatmap of Combination Counts",
x = "Housing Median Age Category",
y = "Population Category",
fill = "Count"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
geom_text(aes(label = count), vjust = 1, color = "black", size = 3)