library(readr)
library(pwr)
library(ggplot2)
# Load the housing dataset into a data frame.
# NOTE(review): this absolute path is specific to one machine; point it at
# your own copy of housing.csv before running the script elsewhere.
housing_data <- read.csv("/Users/sharmistaroy/Downloads/housing.csv")

# View() opens a spreadsheet-style data viewer, which is only useful in an
# interactive session. Guard it so the script can also run non-interactively
# (e.g. via Rscript) without trying to open a viewer.
if (interactive()) {
  View(housing_data)
}

Power analysis for the first hypothesis (required sample size per group):

# Power-analysis inputs: significance level, target power, and the smallest
# standardized effect size the test should be able to detect.
alpha <- 0.05
power <- 0.80
min_effect_size <- 0.02 # very small effect size

# Solve for the sample size of a two-sample t-test. `n` is not supplied, so
# pwr.t.test() derives it from the effect size, significance level and power.
required_sample_size <- pwr::pwr.t.test(
  type = "two.sample",
  power = power,
  sig.level = alpha,
  d = min_effect_size
)

# Show the full power calculation, including the required n per group
print(required_sample_size)
## 
##      Two-sample t test power calculation 
## 
##               n = 39245.26
##               d = 0.02
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number in *each* group

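The dataset consists of 20,640 observations, which is fewer than the roughly 39,246 observations per group that the power calculation above requires to detect an effect as small as d = 0.02 with 80% power.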

Hypothesis 1: Median Household Income and House Value Correlation

# Test H0: the Pearson correlation between median income and median house
# value is zero. The default settings (two-sided alternative, Pearson method,
# 95% confidence level) are written out explicitly for readability.
correlation_result <- cor.test(
  x = housing_data$median_income,
  y = housing_data$median_house_value,
  alternative = "two.sided",
  method = "pearson",
  conf.level = 0.95
)

print("Pearson Correlation Test (Neyman-Pearson Style):")
## [1] "Pearson Correlation Test (Neyman-Pearson Style):"
print(correlation_result)
## 
##  Pearson's product-moment correlation
## 
## data:  housing_data$median_income and housing_data$median_house_value
## t = 136.22, df = 20638, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6808236 0.6951920
## sample estimates:
##       cor 
## 0.6880752

The results of the Pearson Correlation Test provide strong evidence to reject the null hypothesis. The extremely low p-value suggests a significant correlation between median household income and median house value in California. The positive correlation coefficient (r ≈ 0.69) indicates that as median household income increases, median house value tends to increase as well. The confidence interval for the correlation coefficient also supports this finding, as it excludes zero.

# Fisher's z-test of H0: rho = 0.
# The Fisher transform z = atanh(r) = 0.5 * log((1 + r) / (1 - r)) is
# approximately normal with standard error 1 / sqrt(n - 3). The transform must
# therefore be multiplied by sqrt(n - 3) before it is compared with the
# standard normal distribution.
r_hat <- unname(correlation_result$estimate) # drop the "cor" name

# cor.test() reports df = n - 2 for the pairs it actually used, so recover n
# from that value rather than from nrow(), which would also count any rows
# with missing values.
n_pairs <- unname(correlation_result$parameter) + 2

fisher_z <- atanh(r_hat)

# Kept for compatibility with any later code that reads `df`; the z-test
# itself does not use it.
df <- nrow(housing_data) - 2

z_statistic <- fisher_z * sqrt(n_pairs - 3)

# Two-sided p-value. lower.tail = FALSE avoids the loss of precision in
# 1 - pnorm(.) when the statistic is large.
p_value_fisher <- 2 * pnorm(abs(z_statistic), lower.tail = FALSE)

# Print the result
print("Fisher's Style Test for Significance (Fisher's Z-Test):")
## [1] "Fisher's Style Test for Significance (Fisher's Z-Test):"
print(p_value_fisher)
## [1] 0

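Note that 0.3985 is not a correlation coefficient: it is the p-value obtained when the Fisher z-transformed correlation is compared with the standard normal distribution without first dividing by its standard error, 1/sqrt(n − 3). The sample correlation itself is r ≈ 0.688, a strong positive correlation. Once the transformed value is scaled by sqrt(n − 3), the test statistic is approximately 121 and the p-value is effectively zero, so Fisher's test agrees with the Pearson test: there is a statistically significant positive correlation between median household income and median house value.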

# Power-analysis inputs: significance level, target power, and the smallest
# standardized effect size the test should be able to detect.
alpha <- 0.05
power <- 0.80
min_effect_size <- 0.3 # moderate effect size

# Solve for the sample size of a two-sample t-test. `n` is not supplied, so
# pwr.t.test() derives it from the effect size, significance level and power.
required_sample_size <- pwr::pwr.t.test(
  type = "two.sample",
  power = power,
  sig.level = alpha,
  d = min_effect_size
)

# Show the full power calculation, including the required n per group
print(required_sample_size)
## 
##      Two-sample t test power calculation 
## 
##               n = 175.3847
##               d = 0.3
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number in *each* group

Hypothesis 2: Proximity to the Ocean Affects House Prices

# Chi-squared test of independence between ocean proximity and house value.
# median_house_value is continuous, so tabulating its raw values creates one
# column per distinct value and leaves almost every cell nearly empty, which
# invalidates the chi-squared approximation. Bin the values into quartile
# groups so the contingency table has a few well-populated columns.
value_breaks <- unique(quantile(
  housing_data$median_house_value,
  probs = seq(0, 1, by = 0.25),
  na.rm = TRUE
))
house_value_group <- cut(
  housing_data$median_house_value,
  breaks = value_breaks,
  include.lowest = TRUE # keep the minimum value inside the first bin
)

contingency_table <- table(housing_data$ocean_proximity, house_value_group)

# Perform Chi-squared test
chi_squared_result <- chisq.test(contingency_table)

# Print the result
print("Chi-squared Test for Independence (Neyman-Pearson Style):")
## [1] "Chi-squared Test for Independence (Neyman-Pearson Style):"
print(chi_squared_result)
## NOTE(review): re-run to regenerate the printed test output; it is not
## reproduced here because the statistic and degrees of freedom depend on the
## binned table.

The results of the Chi-squared Test for Independence provide strong evidence to reject the null hypothesis. The extremely low p-value suggests a significant association between ocean proximity and median house prices in California.

library(stats)


# Fit a linear model of median house value on the ocean-proximity category.
anova_model <- lm(
  formula = median_house_value ~ ocean_proximity,
  data = housing_data
)

# Build the analysis-of-variance table (F-test for the ocean_proximity term)
anova_result <- stats::anova(anova_model)

# Display the ANOVA table
print("ANOVA Test for Significance:")
## [1] "ANOVA Test for Significance:"
print(anova_result)
## Analysis of Variance Table
## 
## Response: median_house_value
##                    Df     Sum Sq    Mean Sq F value    Pr(>F)    
## ocean_proximity     4 6.5437e+13 1.6359e+13  1612.1 < 2.2e-16 ***
## Residuals       20635 2.0939e+14 1.0148e+10                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The results of the Analysis of Variance (ANOVA) test provide strong evidence to reject the null hypothesis. The extremely low p-value suggests a significant difference in median house values among different categories of ocean proximity in California.

• Build two visualizations that best illustrate the results from the two pairs of hypothesis tests, one for each null hypothesis.

library(ggplot2)

# Scatterplot of median house value against median income with a fitted
# least-squares line.
# The outcome of the correlation test is a single value for the whole dataset,
# so it is reported in the subtitle. Mapping it to the colour aesthetic would
# colour every point identically and add a legend that carries no information.
correlation_significance <- if (correlation_result$p.value < 0.05) {
  "Significant"
} else {
  "Not Significant"
}

ggplot(housing_data, aes(x = median_income, y = median_house_value)) +
  geom_point(alpha = 0.3) + # semi-transparent points reduce overplotting
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Scatterplot of Median Income vs. Median House Value",
    subtitle = sprintf(
      "Pearson r = %.2f (%s at the 0.05 level)",
      correlation_result$estimate,
      correlation_significance
    ),
    x = "Median Income",
    y = "Median House Value"
  )
## `geom_smooth()` using formula = 'y ~ x'

library(ggplot2)



# Boxplot of median house value for each ocean-proximity category.
# A bar chart built with geom_bar(stat = "identity") stacks every row, so each
# bar's height is the sum of the house values in that category and mostly
# reflects how many rows the category has. A boxplot shows the distribution
# of house values within each category, which is what the tests compare.
# The test outcome is a single value for the whole dataset, so it is reported
# in the subtitle rather than mapped to the fill aesthetic.
proximity_significance <- if (chi_squared_result$p.value < 0.05) {
  "Significant"
} else {
  "Not Significant"
}

ggplot(housing_data, aes(x = ocean_proximity, y = median_house_value)) +
  geom_boxplot() +
  labs(
    title = "Median House Value by Ocean Proximity",
    subtitle = paste0(
      "Association with ocean proximity: ",
      proximity_significance,
      " at the 0.05 level"
    ),
    x = "Ocean Proximity",
    y = "House Value"
  ) +
  theme_minimal()