Data dive week 3

###California Housing Dataset.

library(readr)
library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
housing<-read.csv("housing.csv")
View(housing)

housing<- housing %>%
  mutate(
    population_category = cut(
      population,
      breaks = c(0, 1000, 3000, 5000, 10000, Inf),
      labels = c("Low", "Moderate", "High", "Very High", "Extreme")
    ),
    housing_median_age_category = cut(
      housing_median_age,
      breaks = c(0, 10, 20, 30, 40, 50, 60, Inf),
      labels = c("0-10", "11-20", "21-30", "31-40", "41-50", "51-60", "61+")
    )
  )

We created two new categorical columns. population_category and housing_median_age_category

Grouping and calculating the probabilities by group.

grouped_by_ocean <- housing%>%
  group_by(ocean_proximity) %>%
  summarize(
    mean_median_house_value = mean(median_house_value),
    count = n()
  )

grouped_by_ocean <- grouped_by_ocean %>%
  mutate(
    probability = count / sum(count)
  ) %>%
  arrange(desc(probability))

lowest_probability <- grouped_by_ocean$probability[length(grouped_by_ocean$probability)]
grouped_by_ocean <- grouped_by_ocean %>%
  mutate(
    anomaly = ifelse(probability <= lowest_probability, "Anomaly", "Normal")
  )

Lowest Probability of ocean is 0.0002422481

library(ggplot2)
ggplot(grouped_by_ocean, aes(x = ocean_proximity, y = probability, fill = anomaly)) +
  geom_bar(stat = "identity") +
  labs(title = "Anomaly Detection by Ocean Proximity", y = "Probability") +
  theme_minimal()

Compared to other categories, the “ISLAND” category in the ocean_proximity variable appears to be unusual, according to the study of probabilities. This is probably because of the geographical restrictions associated with being on an island. Understanding California’s housing trends and economic growth may be possible with the use of this information.

grouped_by_population <- housing %>%
  group_by(population_category) %>%
  summarize(
    mean_median_house_value = mean(median_house_value),
    count = n()
  )

grouped_by_population <- grouped_by_population %>%
  mutate(
    probability = count / sum(count)
  ) %>%
  arrange(desc(probability))

lowest_probability <- grouped_by_population$probability[length(grouped_by_population$probability)]
grouped_by_population <- grouped_by_population %>%
  mutate(
    anomaly = ifelse(probability <= lowest_probability, "Anomaly", "Normal")
  )

Lowest Probability of population is 0.001114341

ggplot(grouped_by_population, aes(x = population_category, y = probability, fill = anomaly)) +
  geom_bar(stat = "identity") +
  labs(title = "Anomaly Detection by Population Category", y = "Probability") +
  theme_minimal()

The analysis of probabilities related to the population_category variable suggests that districts with extreme population categories are rare compared to those with moderate population densities. The rarity of these extreme categories could be due to various factors related to housing development, infrastructure, and geographic constraints.

grouped_by_age <- housing %>%
  group_by(housing_median_age_category) %>%
  summarize(
    mean_median_house_value = mean(median_house_value),
    count = n()
  )

grouped_by_age <- grouped_by_age %>%
  mutate(
    probability = count / sum(count)
  ) %>%
  arrange(desc(probability))

lowest_probability <- min(grouped_by_age$probability)
grouped_by_age <- grouped_by_age %>%
  mutate(
    anomaly = if_else(probability <= lowest_probability, "Anomaly", "Normal")
  )

Lowest Probability of age is 0.06400194

ggplot(grouped_by_age, aes(x = housing_median_age_category, y = probability, fill = anomaly)) +
  geom_bar(stat = "identity") +
  labs(title = "Anomaly Detection by Housing Median Age Category", y = "Probability") +
  theme_minimal()

possible_combinations <- expand.grid(
  ocean_proximity = unique(housing$ocean_proximity),
  population_category = unique(housing$population_category),
  housing_median_age_category = unique(housing$housing_median_age_category)
)

missing_combinations <- anti_join(possible_combinations, housing)

## Joining with `by = join_by(ocean_proximity, population_category,
## housing_median_age_category)`

combination_counts <- housing %>%
  group_by(
    ocean_proximity,
    population_category,
    housing_median_age_category
  ) %>%
  summarize(count = n())

## `summarise()` has grouped output by 'ocean_proximity', 'population_category'.
## You can override using the `.groups` argument.

combination_counts <- combination_counts %>%
  arrange(desc(count))

head(combination_counts, n = 10)

## # A tibble: 10 × 4
## # Groups:   ocean_proximity, population_category [4]
##    ocean_proximity population_category housing_median_age_category count
##    <chr>           <fct>               <fct>                       <int>
##  1 <1H OCEAN       Moderate            31-40                        1690
##  2 <1H OCEAN       Moderate            21-30                        1484
##  3 <1H OCEAN       Low                 31-40                        1154
##  4 INLAND          Moderate            11-20                        1132
##  5 <1H OCEAN       Moderate            11-20                        1130
##  6 INLAND          Moderate            21-30                         855
##  7 INLAND          Low                 11-20                         796
##  8 INLAND          Low                 31-40                         686
##  9 INLAND          Moderate            31-40                         662
## 10 INLAND          Low                 21-30                         647

tail(combination_counts, n = 10)

## # A tibble: 10 × 4
## # Groups:   ocean_proximity, population_category [8]
##    ocean_proximity population_category housing_median_age_category count
##    <chr>           <fct>               <fct>                       <int>
##  1 ISLAND          Low                 51-60                           2
##  2 NEAR OCEAN      Extreme             0-10                            2
##  3 INLAND          Very High           41-50                           1
##  4 INLAND          Very High           51-60                           1
##  5 INLAND          Extreme             11-20                           1
##  6 ISLAND          Moderate            51-60                           1
##  7 NEAR BAY        Very High           31-40                           1
##  8 NEAR OCEAN      High                51-60                           1
##  9 NEAR OCEAN      Very High           41-50                           1
## 10 NEAR OCEAN      Extreme             21-30                           1

Combinations Heatmap.

ggplot(combination_counts, aes(
  x = housing_median_age_category,
  y = population_category,
  fill = count
)) +
  geom_tile() +
  scale_fill_gradient(
    low = "white", high = "blue",
    limits = c(0, max(combination_counts$count))
  ) +
  labs(
    title = "Heatmap of Combination Counts",
    x = "Housing Median Age Category",
    y = "Population Category",
    fill = "Count"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_text(aes(label = count), vjust = 1, color = "black", size = 3)

Data dive week 3

Sharmista Kothavadla

2023-09-11