```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
```

# Load required libraries
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(pwr)

# Import the Ames housing data; the path is machine-specific, so point it at
# your own copy of ames.csv. read.csv() treats the first row as a header by
# default.
ames <- read.csv(file = "D:/Stats for DS/ames.csv")

# Compact overview of every column: name, type and first few values
str(ames)
## 'data.frame':    2930 obs. of  82 variables:
##  $ Order          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ PID            : int  526301100 526350040 526351010 526353030 527105010 527105030 527127150 527145080 527146030 527162130 ...
##  $ MS.SubClass    : int  20 20 20 20 60 60 120 120 120 60 ...
##  $ MS.Zoning      : chr  "RL" "RH" "RL" "RL" ...
##  $ Lot.Frontage   : int  141 80 81 93 74 78 41 43 39 60 ...
##  $ Lot.Area       : int  31770 11622 14267 11160 13830 9978 4920 5005 5389 7500 ...
##  $ Street         : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley          : chr  NA NA NA NA ...
##  $ Lot.Shape      : chr  "IR1" "Reg" "IR1" "Reg" ...
##  $ Land.Contour   : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities      : chr  "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ Lot.Config     : chr  "Corner" "Inside" "Corner" "Corner" ...
##  $ Land.Slope     : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood   : chr  "NAmes" "NAmes" "NAmes" "NAmes" ...
##  $ Condition.1    : chr  "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition.2    : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ Bldg.Type      : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ House.Style    : chr  "1Story" "1Story" "1Story" "1Story" ...
##  $ Overall.Qual   : int  6 5 6 7 5 6 8 8 8 7 ...
##  $ Overall.Cond   : int  5 6 6 5 5 6 5 5 5 5 ...
##  $ Year.Built     : int  1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
##  $ Year.Remod.Add : int  1960 1961 1958 1968 1998 1998 2001 1992 1996 1999 ...
##  $ Roof.Style     : chr  "Hip" "Gable" "Hip" "Hip" ...
##  $ Roof.Matl      : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior.1st   : chr  "BrkFace" "VinylSd" "Wd Sdng" "BrkFace" ...
##  $ Exterior.2nd   : chr  "Plywood" "VinylSd" "Wd Sdng" "BrkFace" ...
##  $ Mas.Vnr.Type   : chr  "Stone" "None" "BrkFace" "None" ...
##  $ Mas.Vnr.Area   : int  112 0 108 0 0 20 0 0 0 0 ...
##  $ Exter.Qual     : chr  "TA" "TA" "TA" "Gd" ...
##  $ Exter.Cond     : chr  "TA" "TA" "TA" "TA" ...
##  $ Foundation     : chr  "CBlock" "CBlock" "CBlock" "CBlock" ...
##  $ Bsmt.Qual      : chr  "TA" "TA" "TA" "TA" ...
##  $ Bsmt.Cond      : chr  "Gd" "TA" "TA" "TA" ...
##  $ Bsmt.Exposure  : chr  "Gd" "No" "No" "No" ...
##  $ BsmtFin.Type.1 : chr  "BLQ" "Rec" "ALQ" "ALQ" ...
##  $ BsmtFin.SF.1   : int  639 468 923 1065 791 602 616 263 1180 0 ...
##  $ BsmtFin.Type.2 : chr  "Unf" "LwQ" "Unf" "Unf" ...
##  $ BsmtFin.SF.2   : int  0 144 0 0 0 0 0 0 0 0 ...
##  $ Bsmt.Unf.SF    : int  441 270 406 1045 137 324 722 1017 415 994 ...
##  $ Total.Bsmt.SF  : int  1080 882 1329 2110 928 926 1338 1280 1595 994 ...
##  $ Heating        : chr  "GasA" "GasA" "GasA" "GasA" ...
##  $ Heating.QC     : chr  "Fa" "TA" "TA" "Ex" ...
##  $ Central.Air    : chr  "Y" "Y" "Y" "Y" ...
##  $ Electrical     : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ X1st.Flr.SF    : int  1656 896 1329 2110 928 926 1338 1280 1616 1028 ...
##  $ X2nd.Flr.SF    : int  0 0 0 0 701 678 0 0 0 776 ...
##  $ Low.Qual.Fin.SF: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Gr.Liv.Area    : int  1656 896 1329 2110 1629 1604 1338 1280 1616 1804 ...
##  $ Bsmt.Full.Bath : int  1 0 0 1 0 0 1 0 1 0 ...
##  $ Bsmt.Half.Bath : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Full.Bath      : int  1 1 1 2 2 2 2 2 2 2 ...
##  $ Half.Bath      : int  0 0 1 1 1 1 0 0 0 1 ...
##  $ Bedroom.AbvGr  : int  3 2 3 3 3 3 2 2 2 3 ...
##  $ Kitchen.AbvGr  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Kitchen.Qual   : chr  "TA" "TA" "Gd" "Ex" ...
##  $ TotRms.AbvGrd  : int  7 5 6 8 6 7 6 5 5 7 ...
##  $ Functional     : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces     : int  2 0 0 2 1 1 0 0 1 1 ...
##  $ Fireplace.Qu   : chr  "Gd" NA NA "TA" ...
##  $ Garage.Type    : chr  "Attchd" "Attchd" "Attchd" "Attchd" ...
##  $ Garage.Yr.Blt  : int  1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
##  $ Garage.Finish  : chr  "Fin" "Unf" "Unf" "Fin" ...
##  $ Garage.Cars    : int  2 1 1 2 2 2 2 2 2 2 ...
##  $ Garage.Area    : int  528 730 312 522 482 470 582 506 608 442 ...
##  $ Garage.Qual    : chr  "TA" "TA" "TA" "TA" ...
##  $ Garage.Cond    : chr  "TA" "TA" "TA" "TA" ...
##  $ Paved.Drive    : chr  "P" "Y" "Y" "Y" ...
##  $ Wood.Deck.SF   : int  210 140 393 0 212 360 0 0 237 140 ...
##  $ Open.Porch.SF  : int  62 0 36 0 34 36 0 82 152 60 ...
##  $ Enclosed.Porch : int  0 0 0 0 0 0 170 0 0 0 ...
##  $ X3Ssn.Porch    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Screen.Porch   : int  0 120 0 0 0 0 0 144 0 0 ...
##  $ Pool.Area      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Pool.QC        : chr  NA NA NA NA ...
##  $ Fence          : chr  NA "MnPrv" NA NA ...
##  $ Misc.Feature   : chr  NA NA "Gar2" NA ...
##  $ Misc.Val       : int  0 0 12500 0 0 0 0 0 0 0 ...
##  $ Mo.Sold        : int  5 6 6 4 3 6 4 1 3 6 ...
##  $ Yr.Sold        : int  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ Sale.Type      : chr  "WD " "WD " "WD " "WD " ...
##  $ Sale.Condition : chr  "Normal" "Normal" "Normal" "Normal" ...
##  $ SalePrice      : int  215000 105000 172000 244000 189900 195500 213500 191500 236500 189000 ...

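The dataset contains 82 columns and 2,930 entries, including the variables used in this analysis: SalePrice (the sale price in dollars), Neighborhood (the neighborhood of the property), and Gr.Liv.Area (the above-ground living area in square feet).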

Next, I will proceed to set up two null hypotheses, calculate sample sizes, and conduct hypothesis testing. Let’s begin with Hypothesis 1 using the Neyman-Pearson framework.

Hypothesis 1: Neyman-Pearson Framework

# Neyman-Pearson design: fix the error rates and the effect size of interest
# before running the test.
alpha <- 0.05         # Significance level (Type I error rate)
power <- 0.8          # Desired power (1 - Type II error rate)
effect_size <- 0.5    # Cohen's d; 0.5 is conventionally a "medium" effect

# Solve for the per-group sample size of a two-sample t-test at these settings
sample_size <- pwr.t.test(
  d = effect_size,
  sig.level = alpha,
  power = power,
  type = "two.sample"
)$n

# Round up rather than to the nearest integer: rounding a fractional n down
# would leave the design short of the requested power.
print(paste("Required sample size per group:", ceiling(sample_size)))
## [1] "Required sample size per group: 64"
# Pull the sale prices for each of the two neighborhoods being compared.
# %in% never yields NA, so rows with a missing Neighborhood are left out.
group1 <- ames$SalePrice[ames$Neighborhood %in% "NAmes"]
group2 <- ames$SalePrice[ames$Neighborhood %in% "CollgCr"]

# Report how many sales fall in each group
length(group1)
length(group2)
## [1] 443
## [1] 267
# Compare mean SalePrice between the two groups. The named arguments below
# are t.test()'s defaults, written out for clarity: two-sided alternative,
# unequal variances (Welch) and a 95% confidence interval.
t_test_result <- t.test(
  group1,
  group2,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)

# Show the full test summary
print(t_test_result)
## 
##  Welch Two Sample t-test
## 
## data:  group1 and group2
## t = -15.554, df = 378.66, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -63874.69 -49537.48
## sample estimates:
## mean of x mean of y 
##  145097.3  201803.4
# Decision rule: reject H0 when the p-value does not exceed alpha
print(
  if (t_test_result$p.value <= alpha) {
    "Reject the null hypothesis: Sale Prices are significantly different between the two neighborhoods."
  } else {
    "Fail to reject the null hypothesis: No significant difference in Sale Prices between the two neighborhoods."
  }
)
## [1] "Reject the null hypothesis: Sale Prices are significantly different between the two neighborhoods."
# Visualization for Hypothesis 1: Boxplot
# Fill is mapped to Neighborhood and coloured through a named palette, so each
# colour is tied to its group explicitly instead of relying on a bare colour
# vector whose length and order must happen to match the boxes drawn.
ames %>%
  filter(Neighborhood %in% c("NAmes", "CollgCr")) %>%
  ggplot(aes(x = Neighborhood, y = SalePrice, fill = Neighborhood)) +
  # The x axis already labels the groups, so the fill legend is redundant
  geom_boxplot(show.legend = FALSE) +
  scale_fill_manual(values = c(CollgCr = "lightblue", NAmes = "lightgreen")) +
  labs(
    title = "SalePrice Comparison: NAmes vs CollgCr",
    x = "Neighborhood",
    y = "SalePrice ($)"
  ) +
  theme_minimal()

Hypothesis 2: Fisher’s Significance Testing (Gr.Liv.Area and SalePrice)

# Test for a linear association between above-ground living area and sale
# price. The named arguments below are cor.test()'s defaults, written out for
# clarity: Pearson's r, two-sided alternative and a 95% confidence interval.
correlation_result <- cor.test(
  ames$Gr.Liv.Area,
  ames$SalePrice,
  alternative = "two.sided",
  method = "pearson",
  conf.level = 0.95
)

# Show the full test summary
print(correlation_result)
## 
##  Pearson's product-moment correlation
## 
## data:  ames$Gr.Liv.Area and ames$SalePrice
## t = 54.061, df = 2928, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6881814 0.7244502
## sample estimates:
##       cor 
## 0.7067799
# Compare the observed p-value against the significance threshold
alpha <- 0.05  # Threshold for declaring significance
print(
  if (correlation_result$p.value > alpha) {
    "Fail to reject the null hypothesis: No significant correlation between Gr.Liv.Area and SalePrice."
  } else {
    "Reject the null hypothesis: There is a significant correlation between Gr.Liv.Area and SalePrice."
  }
)
## [1] "Reject the null hypothesis: There is a significant correlation between Gr.Liv.Area and SalePrice."
# Scatter of living area against sale price, overlaid with a fitted linear
# trend line
ggplot(data = ames, mapping = aes(x = Gr.Liv.Area, y = SalePrice)) +
  geom_point(colour = "blue") +
  geom_smooth(method = "lm", colour = "red") +
  labs(
    title = "Correlation between Gr.Liv.Area and SalePrice",
    x = "Above Ground Living Area (sq ft)",
    y = "Sale Price ($)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Conclusion:

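Hypothesis 1: The Welch two-sample t-test gave t = -15.554 with p < 2.2e-16, far below the significance level of 0.05, so we reject the null hypothesis: there is a statistically significant difference in the sale prices of homes in the “NAmes” and “CollgCr” neighborhoods. The mean sale price was about $145,097 in NAmes versus about $201,803 in CollgCr, with a 95% confidence interval for the difference (NAmes minus CollgCr) of roughly -$63,875 to -$49,537. Had the p-value been greater than 0.05, we would not have had sufficient evidence to claim that the sale prices differ significantly between these two neighborhoods.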

Hypothesis 2: The Pearson correlation test gave r ≈ 0.707 (95% confidence interval 0.688 to 0.724) with p < 2.2e-16. Because the p-value is smaller than the significance level (0.05), we reject the null hypothesis and conclude that there is a statistically significant linear relationship between the living area and the sale price in this dataset. This result suggests that larger homes tend to sell for higher prices.

The scatter plot with the regression line visually supports this conclusion, as we see a clear upward trend between Gr.Liv.Area and SalePrice, indicating a positive relationship.