Statistical Analysis (I): Estimation & Testing Homework Assignment 2

library(car)
library("ggplot2")
# Install pacman on first use, then let it install (if missing) and attach
# the packages used by this analysis.
# requireNamespace() only checks whether pacman is available; unlike require()
# it does not attach the package, which is fine because pacman is only ever
# called here through the `pacman::` prefix.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}

pacman::p_load("moments", "extRemes", "stringi", "ggplot2")

# Use the assignment folder as working directory when it exists; on any other
# machine keep the current working directory so the script does not abort
# before it even tries to read the data.
data_dir <- "D:\\Google Drive\\SA\\Assignment 2"
if (dir.exists(data_dir)) {
  setwd(data_dir)
}
# Read the house price data from the CSV file (first row = column names)
housepricedf <- read.csv("houseprices.csv", header = TRUE)

Task 1.

1. Your friend claims that the average house price in this area is above $150K. Do you agree? Briefly explain what the p-values in these cases mean?(1mark)

Solution 1a: We formulate our null hypothesis as H0: mu <= $150K, against the alternative H1: mu > $150K.

# One-sample, right-tailed t-test of H0: mu <= 150000 against H1: mu > 150000.
# Hypothesised mean price
mu <- 150000
# housepricedf is already loaded at the top of the script, so the CSV is not
# read a second time here.
price <- housepricedf$Price

# Sample mean, kept unrounded so the test statistic is not distorted
sampleMeanPrice <- mean(price)
# Number of observations
samplesize <- length(price)
# Sample standard deviation
sdprice <- sd(price)
# Degrees of freedom of the one-sample t statistic
df <- samplesize - 1
# Alpha value or significance level
alpha_value <- 0.05

# Right-tail probability P(T >= t). lower.tail = FALSE is used instead of
# 1 - pt(...) because the subtraction loses precision for very small p-values.
p_value_1a <- pt((sampleMeanPrice - mu) * sqrt(samplesize) / sdprice, df,
                 lower.tail = FALSE)

## P-value is approximately 2.7e-11

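## Since the p-value is less than the alpha value (0.05), we reject the null hypothesis and conclude that the average house price is greater than $150K. The p-value is the probability of observing a sample mean at least as large as ours if the true average price were $150K.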

He also claims that the average living area is more than 1800 Sq. Ft. Do you agree with this? (Use a 5% significance level for both.). Briefly explain what the p-values in these cases mean? (1mark)

Solution 1b: We formulate our null hypothesis as H0: mu <= 1800 sq. ft. for the average living area, against the alternative H1: mu > 1800 sq. ft.

# One-sample, right-tailed t-test of H0: mu <= 1800 against H1: mu > 1800
# for the living area.
mu <- 1800
# Living area vector
livingarea <- housepricedf$Living.Area
# Sample standard deviation and sample mean of the living area
sdlivingarea <- sd(livingarea)
samplemeanLivingArea <- mean(livingarea)
# Sample size and degrees of freedom are taken from livingarea itself rather
# than from the globals `samplesize` / `df`: `df` is reassigned later in the
# script, so re-running this chunk afterwards would silently use the wrong
# degrees of freedom.
n_livingarea <- length(livingarea)
# Right-tail probability P(T >= t)
p_value_1b <- pt(
  (samplemeanLivingArea - mu) * sqrt(n_livingarea) / sdlivingarea,
  n_livingarea - 1,
  lower.tail = FALSE
)

## P-value is 0.356333895962788

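## Since the p-value is greater than the alpha value (0.05), we fail to reject the null hypothesis: the data do not provide sufficient evidence that the average living area is greater than 1800 sq. ft. The p-value is the probability of observing a sample mean at least as large as ours if the true average living area were 1800 sq. ft.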

Task 2

Are the home prices higher for houses with fireplaces as compared to those without?

2a) Create side-by-side box plots of the house prices of the two groups and comment them. (2marks)

# Split the data on the Fireplace indicator.
# Rows whose Fireplace value equals 1 (fireplace available)
hp_fp <- housepricedf[
  which(as.numeric(housepricedf$Fireplace) == 1), , drop = FALSE
]
# Rows whose Fireplace value equals 0 (no fireplace available)
hp_nfp <- housepricedf[
  which(as.numeric(housepricedf$Fireplace) == 0), , drop = FALSE
]

Looking at the box plots, houses with a fireplace appear to be more expensive than houses without one.

2b) Formulate an appropriate hypothesis and test it in order to check the above claim. Assume that the population standard deviations of house prices in the two groups are equal. (1mark)

Solution 2b: Our null hypothesis is that houses with a fireplace are not more expensive on average, i.e. mu(house with fireplace) <= mu(house without fireplace). Since the population standard deviations of the two groups are given to be equal, we estimate the common standard deviation with the pooled sample standard deviation.

# Pooled (equal-variance) two-sample t-test, right-tailed:
# H0: mu_fp - mu_nfp <= 0   vs   H1: mu_fp - mu_nfp > 0
sample_mean_fp <- mean(hp_fp$Price)
Sample_mean_nfp <- mean(hp_nfp$Price)
n_fp <- nrow(hp_fp)
n_nfp <- nrow(hp_nfp)
# The population SDs are assumed equal, so the common SD is estimated by
# pooling the two sample variances, each weighted by its degrees of freedom.
temp1 <- (n_fp - 1) * var(hp_fp$Price) + (n_nfp - 1) * var(hp_nfp$Price)
temp2 <- n_fp + n_nfp - 2
pool_sd <- sqrt(temp1 / temp2)
# Degrees of freedom in this case
df <- n_fp + n_nfp - 2
# Standard error of the difference in means; it must use BOTH group sizes
# (1/n_fp + 1/n_nfp), not the fireplace group size twice.
se_diff <- pool_sd * sqrt(1 / n_fp + 1 / n_nfp)
# Right-tail p-value P(T >= t) with t = (mean_fp - mean_nfp) / se_diff
p_value <- pt((sample_mean_fp - Sample_mean_nfp) / se_diff, df,
              lower.tail = FALSE)

## p_value is far below the 5% significance level (re-run to print the exact
## value), so we reject the null hypothesis and accept the claim that houses
## with a fireplace are more expensive than houses without one.

Task 3

Any house aged more than 30 years is considered an “old” house. Your friend claims that old houses have larger lot sizes than new houses. Do you agree? Explain. Use a significance level of 5% for your test. Historical data suggests that old houses include some very large and some very small lot sizes but new houses are more homogeneous in their lot sizes. (2marks)

# Per the task statement a house is "old" when it is aged MORE than 30 years;
# every other house is "new".
oldhouses <- subset(housepricedf, as.numeric(housepricedf$Age) > 30)
newhouses <- subset(housepricedf, as.numeric(housepricedf$Age) <= 30)
mean_size_oldhouses <- mean(oldhouses$Lot.Size)
mean_size_newhouses <- mean(newhouses$Lot.Size)
sd_oh <- sd(oldhouses$Lot.Size)
sd_nh <- sd(newhouses$Lot.Size)
# As per the problem statement the variance differs between the two groups,
# so the variances are not pooled (Welch test).
# H0: mu_oh - mu_nh <= 0   vs   H1: mu_oh - mu_nh > 0 (old lots are larger)
t.test(oldhouses$Lot.Size, newhouses$Lot.Size,
       alternative = "greater", var.equal = FALSE)

## The Welch test output shown in the earlier knit was produced with the
## old/new groups swapped and a two-sided alternative; re-knit to regenerate it.
## With the corrected grouping the earlier figures translate to:
##   t = -0.59026, df = 610.28
##   mean of x (old houses) = 0.5481788, mean of y (new houses) = 0.5782550

# The same test by hand.
# Squared standard error of each group mean
se2_oh <- sd_oh^2 / nrow(oldhouses)
se2_nh <- sd_nh^2 / nrow(newhouses)
# Welch-Satterthwaite degrees of freedom: (a + b)^2 / (a^2/(n1-1) + b^2/(n2-1))
temp1 <- (se2_oh + se2_nh)^2
temp2 <- se2_oh^2 / (nrow(oldhouses) - 1) + se2_nh^2 / (nrow(newhouses) - 1)
df_3 <- temp1 / temp2
# Right-tail p-value P(T >= t) with t = (mean_old - mean_new) / se
p_value <- pt(
  (mean_size_oldhouses - mean_size_newhouses) / sqrt(se2_oh + se2_nh),
  df_3,
  lower.tail = FALSE
)

## p_value (about 0.72) is greater than the 5% significance level, so we fail
## to reject the null hypothesis: the data do not support the claim that old
## houses have larger lot sizes than new houses.

Task 4

Based on the evidence available here, would you be willing to claim that fireplaces have become more fashionable? For simplicity, it is OK to compare only “new” houses and “old” houses. Use a significance level of 5% for your test. (1mark)

# Two-proportion z-test: are fireplaces more common in new houses than in old
# houses? The groups are rebuilt from Age here ("old" = aged more than 30
# years) so this test does not depend on how other chunks label their subsets.
# old houses (Age > 30):  302 houses, 135 with fireplace, proportion = 0.44702
# new houses (Age <= 30): 745 houses, 486 with fireplace, proportion = 0.65235
# H0: pi(n) - pi(o) <= 0   vs   H1: pi(n) - pi(o) > 0
houses_over_30 <- subset(housepricedf, as.numeric(housepricedf$Age) > 30)
houses_upto_30 <- subset(housepricedf, as.numeric(housepricedf$Age) <= 30)
n_oh <- nrow(houses_over_30)
n_nh <- nrow(houses_upto_30)
# Proportion of houses with a fireplace in each age group
p_oh_fp <- sum(as.numeric(houses_over_30$Fireplace) == 1, na.rm = TRUE) / n_oh
p_nh_fp <- sum(as.numeric(houses_upto_30$Fireplace) == 1, na.rm = TRUE) / n_nh
# Pooled proportion under H0 (both groups share one fireplace rate)
weighted_p <- (p_oh_fp * n_oh + p_nh_fp * n_nh) / (n_oh + n_nh)
# z statistic for (new - old); large positive values support H1
z <- (p_nh_fp - p_oh_fp) /
  sqrt(weighted_p * (1 - weighted_p) * (1 / n_oh + 1 / n_nh))
# Right-tail p-value P(Z >= z)
p_value <- pnorm(z, lower.tail = FALSE)

## p_value (about 4.5e-10) is far below the 5% significance level, so we reject
## the null hypothesis: new houses have a significantly higher proportion of
## fireplaces than old houses, which supports the claim that fireplaces have
## become more fashionable.

Task 5

Suppose that houses with 1-2 bedrooms are considered to be “Small Houses”, those with 3-4 are “Medium Houses” and 5-6 as “Big Houses”. Can we conclude that the prices of Small, Medium and Big houses are not the same, at 1% level of significance? (2marks)

# H0: the mean price is the same in all three size groups
# (mu_small = mu_med = mu_big).
# Houses with fewer than 3 bedrooms
smallhouses <- subset(housepricedf, Bedrooms < 3)
smallhouses[["size"]] <- "small"
# Houses with more than 2 and at most 4 bedrooms
medhouses <- subset(housepricedf, Bedrooms <= 4 & Bedrooms > 2)
medhouses[["size"]] <- "medium"
# Houses with more than 4 bedrooms
bighouses <- subset(housepricedf, Bedrooms > 4)
bighouses[["size"]] <- "big"
# Stack the labelled groups in the order small, medium, big
temp <- rbind(smallhouses, medhouses)
housedataframe <- rbind(temp, bighouses)
# One-way ANOVA of price on the size label, then print the ANOVA table
anova <- aov(housedataframe$Price ~ housedataframe$size)
summary(anova)

##                       Df    Sum Sq   Mean Sq F value Pr(>F)    
## housedataframe$size    2 4.840e+11 2.420e+11   58.71 <2e-16 ***
## Residuals           1044 4.303e+12 4.122e+09                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## From the ANOVA calculation the p-value is close to 0 and hence less than the 1% significance level, so we reject our null hypothesis and conclude that the mean prices of Small, Medium and Big houses are not all the same.